From 4f598dd9731497b60a2d4e372fc896636eb34b8c Mon Sep 17 00:00:00 2001 From: niansa Date: Thu, 22 Jun 2023 12:58:07 +0200 Subject: [PATCH 01/43] Initial working stuff --- .gitmodules | 3 + CMakeLists.txt | 18 ++ CMakeLists.txt.user | 454 ++++++++++++++++++++++++++++++++++++++++++++ ggml-vulkan.cpp | 151 +++++++++++++++ ggml-vulkan.h | 13 ++ ggml.c | 4 +- kompute | 1 + 7 files changed, 643 insertions(+), 1 deletion(-) create mode 100644 .gitmodules create mode 100644 CMakeLists.txt.user create mode 100644 ggml-vulkan.cpp create mode 100644 ggml-vulkan.h create mode 160000 kompute diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000..4a068a6982090 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "kompute"] + path = kompute + url = https://github.com/KomputeProject/kompute.git diff --git a/CMakeLists.txt b/CMakeLists.txt index cc7560a7ae54e..cae41110944e2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,6 +73,7 @@ set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF) set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") option(LLAMA_CLBLAST "llama: use CLBlast" OFF) +option(LLAMA_KOMPUTE "llama: use Kompute" OFF) option(LLAMA_METAL "llama: use Metal" OFF) option(LLAMA_K_QUANTS "llama: use k-quants" ON) @@ -309,6 +310,22 @@ if (LLAMA_CLBLAST) endif() endif() +if (LLAMA_KOMPUTE) + if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt") + message(STATUS "Kompute found") + + add_subdirectory(kompute) + + set(GGML_SOURCES_KOMPUTE ggml-vulkan.cpp ggml-vulkan.h) + + add_compile_definitions(GGML_USE_KOMPUTE) + + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute) + else() + message(WARNING "Kompute not found") + endif() +endif() + if (LLAMA_ALL_WARNINGS) if (NOT MSVC) set(c_flags @@ -466,6 +483,7 @@ add_library(ggml OBJECT ggml.h ${GGML_SOURCES_CUDA} ${GGML_SOURCES_OPENCL} + 
${GGML_SOURCES_KOMPUTE} ${GGML_SOURCES_METAL} ${GGML_SOURCES_EXTRA} ) diff --git a/CMakeLists.txt.user b/CMakeLists.txt.user new file mode 100644 index 0000000000000..e7d3738278eb2 --- /dev/null +++ b/CMakeLists.txt.user @@ -0,0 +1,454 @@ + + + + + + EnvironmentId + {f3929b0b-3d39-4fa3-8d2d-2b329b63b30c} + + + ProjectExplorer.Project.ActiveTarget + 0 + + + ProjectExplorer.Project.EditorSettings + + true + false + true + + Cpp + + CppGlobal + + + + QmlJS + + QmlJSGlobal + + + 2 + UTF-8 + false + 4 + false + 80 + true + true + 1 + false + true + false + 0 + true + true + 0 + 8 + true + false + 1 + true + true + true + *.md, *.MD, Makefile + false + true + true + + + + ProjectExplorer.Project.PluginSettings + + + true + false + true + true + true + true + + + 0 + true + + true + true + Builtin.DefaultTidyAndClazy + 6 + + + + true + + + true + + + + + ProjectExplorer.Project.Target.0 + + Desktop + Clang + Clang + {913660d6-ca1c-4b66-a4da-64108a3258a2} + 0 + 0 + 0 + + Release + false + + -DCMAKE_GENERATOR:STRING=Unix Makefiles +-DCMAKE_BUILD_TYPE:STRING=Release +-DQT_QMAKE_EXECUTABLE:STRING=%{Qt:qmakeExecutable} +-DCMAKE_PREFIX_PATH:STRING=%{Qt:QT_INSTALL_PREFIX} +-DCMAKE_C_COMPILER:STRING=%{Compiler:Executable:C} +-DCMAKE_CXX_COMPILER:STRING=%{Compiler:Executable:Cxx} +-DCMAKE_CXX_FLAGS_INIT:STRING=%{Qt:QML_DEBUG_FLAG} + /mnt/hhdd/Programme/OSS/llama.cpp/../build-llama.cpp-Clang-Release + + + + + all + + false + + true + Build + CMakeProjectManager.MakeStep + + 1 + Build + Build + ProjectExplorer.BuildSteps.Build + + + + + + clean + + false + + true + Build + CMakeProjectManager.MakeStep + + 1 + Clean + Clean + ProjectExplorer.BuildSteps.Clean + + 2 + false + + false + + Release + CMakeProjectManager.CMakeBuildConfiguration + + 1 + + + 0 + Deploy + Deploy + ProjectExplorer.BuildSteps.Deploy + + 1 + + false + ProjectExplorer.DefaultDeployConfiguration + + 1 + + true + true + true + + 2 + + baby-llama + CMakeProjectManager.CMakeRunConfiguration.baby-llama + baby-llama + 
false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + benchmark + CMakeProjectManager.CMakeRunConfiguration.benchmark + benchmark + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + test-quantize-perf + CMakeProjectManager.CMakeRunConfiguration.test-quantize-perf + test-quantize-perf + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + test-sampling + CMakeProjectManager.CMakeRunConfiguration.test-sampling + test-sampling + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + test-tokenizer-0 + CMakeProjectManager.CMakeRunConfiguration.test-tokenizer-0 + test-tokenizer-0 + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + train-text-from-scratch + CMakeProjectManager.CMakeRunConfiguration.train-text-from-scratch + train-text-from-scratch + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + vdot + CMakeProjectManager.CMakeRunConfiguration.vdot + vdot + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + simple + CMakeProjectManager.CMakeRunConfiguration.simple + simple + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + embedding + CMakeProjectManager.CMakeRunConfiguration.embedding + embedding + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + main + CMakeProjectManager.CMakeRunConfiguration.main + main + false + true + true + false + true + 
/mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + perplexity + CMakeProjectManager.CMakeRunConfiguration.perplexity + perplexity + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + q8dot + CMakeProjectManager.CMakeRunConfiguration.q8dot + q8dot + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + quantize + CMakeProjectManager.CMakeRunConfiguration.quantize + quantize + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + quantize-stats + CMakeProjectManager.CMakeRunConfiguration.quantize-stats + quantize-stats + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + save-load-state + CMakeProjectManager.CMakeRunConfiguration.save-load-state + save-load-state + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + + true + true + true + + 2 + + test-quantize-fns + CMakeProjectManager.CMakeRunConfiguration.test-quantize-fns + test-quantize-fns + false + true + true + false + true + /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin + + 16 + + + + ProjectExplorer.Project.TargetCount + 1 + + + ProjectExplorer.Project.Updater.FileVersion + 22 + + + Version + 22 + + diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp new file mode 100644 index 0000000000000..45f426a2fadd0 --- /dev/null +++ b/ggml-vulkan.cpp @@ -0,0 +1,151 @@ +#include "ggml-vulkan.h" +#include "ggml.h" + +#include +#include +#include +#include +#include +#include +#include + +typedef ggml_fp16_t half; + +#define MULTILINE_QUOTE(...) 
#__VA_ARGS__ +#define STRINGIFY(x) STRINGIFY2(x) +#define STRINGIFY2(x) #x + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +typedef struct { + half d; + uint8_t qs[QK4_0 / 2]; +} block_q4_0; + +typedef struct { + half d; + half m; + uint8_t qs[QK4_1 / 2]; +} block_q4_1; + + +kp::Manager mgr; + + + +static const std::string program_source_head = R"( +#version 450 +#extension GL_EXT_shader_explicit_arithmetic_types_float16: enable +#extension GL_EXT_shader_explicit_arithmetic_types_int8: enable +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 +layout (local_size_x = 1) in; +)"; + + +static const std::string kernel_dequantize_row_q4_0 = + program_source_head+'\n'+MULTILINE_QUOTE( +// Tensors +layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; }; +layout(binding = 1) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; }; +layout(binding = 2) buffer tensorY { float y[]; }; + +// Push constants +layout(push_constant) uniform PushConstants { + int k; +} pcs; + +void main() { + const int qk = QK4_0; + + const int i = int(gl_GlobalInvocationID.x); + const int j = int(gl_GlobalInvocationID.y); + + const float16_t d = x_d[i]; + const uint8_t qs = x_qs[i * (QK4_0 / 2) + j]; + + const int x0 = (qs & 0x0F) - 8; + const int x1 = (qs >> 4) - 8; + + y[i*qk + j + 0 ] = float16_t(x0)*d; + y[i*qk + j + qk/2] = float16_t(x1)*d; +} +); + + +std::vector compileSource(const std::string& source) { + //FIXME: Terrible solution!!!! 
+ std::ofstream fileOut("tmp_kp_shader.comp"); + fileOut << source; + fileOut.close(); + if (system(std::string("glslangValidator -V tmp_kp_shader.comp -o tmp_kp_shader.comp.spv").c_str())) + throw std::runtime_error("Error running glslangValidator command"); + std::ifstream fileStream("tmp_kp_shader.comp.spv", std::ios::binary); + std::vector buffer; + buffer.insert(buffer.begin(), std::istreambuf_iterator(fileStream), {}); + return {(uint32_t*)buffer.data(), (uint32_t*)(buffer.data() + buffer.size())}; +} + +void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { + static const int qk = QK4_0; + static const unsigned nb = k / qk; + static const unsigned y_size = qk*2*nb; + const static auto spirv = compileSource(kernel_dequantize_row_q4_0); + + const auto x = reinterpret_cast(x_); + + auto getVecBlockQ4_0D = [] (const block_q4_0 *x) { + std::vector fres; + fres.reserve(nb); + for (unsigned it = 0; it != nb; it++) { + fres.push_back(x[it].d); + } + return fres; + }; + auto getVecBlockQ4_0QS = [] (const block_q4_0 *x) { + std::vector fres; + fres.resize(nb*(qk/2)); + for (unsigned x_it = 0; x_it != nb; x_it++) { + for (unsigned qs_it = 0; qs_it != qk / 2; qs_it++) { + fres.push_back(x[x_it].qs[qs_it]); + } + } + return fres; + }; + + const auto tensorBlockQ4_0D = mgr.tensorT(getVecBlockQ4_0D(x)); + const auto tensorBlockQ4_0QS = mgr.tensorT(getVecBlockQ4_0QS(x)); + const auto tensorY = mgr.tensor(std::vector(y, y+y_size)); + + struct PushConsts { + int k; + } pushConsts { + k + }; + + mgr.sequence() + ->record({tensorBlockQ4_0D, tensorBlockQ4_0QS, tensorY}) + ->record(mgr.algorithm({tensorBlockQ4_0D, tensorBlockQ4_0QS, tensorY}, spirv, {nb, qk/2, 0}, {}, {0}), std::vector{pushConsts}) + ->record({tensorY}) + ->eval(); + + std::memcpy(y, tensorY->data(), tensorY->size()); +} + + +template<> +kp::Tensor::TensorDataTypes +kp::TensorT::dataType() +{ + return TensorDataTypes::eFloat; +} + +template<> +kp::Tensor::TensorDataTypes +kp::TensorT::dataType() +{ + 
return TensorDataTypes::eUnsignedInt; +} diff --git a/ggml-vulkan.h b/ggml-vulkan.h new file mode 100644 index 0000000000000..34e6d46b3dbfa --- /dev/null +++ b/ggml-vulkan.h @@ -0,0 +1,13 @@ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +void ggml_vk_init(void); + +void ggml_vk_dequantize_row_q4_0(const void * x, float * y, int k); + +#ifdef __cplusplus +} +#endif diff --git a/ggml.c b/ggml.c index 4319683f5186e..151b9eefbf4b7 100644 --- a/ggml.c +++ b/ggml.c @@ -161,6 +161,8 @@ inline static void* ggml_aligned_malloc(size_t size) { #endif #elif defined(GGML_USE_OPENBLAS) #include +#elif defined(GGML_USE_KOMPUTE) +#include "ggml-vulkan.h" #elif defined(GGML_USE_CUBLAS) #include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) @@ -1548,7 +1550,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_0, + .dequantize_row_q = (dequantize_row_q_t) ggml_vk_dequantize_row_q4_0, .quantize_row_q = quantize_row_q4_0, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, .quantize_row_q_dot = quantize_row_q8_0, diff --git a/kompute b/kompute new file mode 160000 index 0000000000000..63567a72be6b2 --- /dev/null +++ b/kompute @@ -0,0 +1 @@ +Subproject commit 63567a72be6b26f79da92becaffa7cd55f46642b From 2f3fe0c0a45b6c5130ce6b3dd2cf82dcd9eb8a2e Mon Sep 17 00:00:00 2001 From: niansa Date: Thu, 22 Jun 2023 12:58:33 +0200 Subject: [PATCH 02/43] Updated gitignore --- .gitignore | 2 + CMakeLists.txt.user | 454 -------------------------------------------- 2 files changed, 2 insertions(+), 454 deletions(-) delete mode 100644 CMakeLists.txt.user diff --git a/.gitignore b/.gitignore index e7bfd52e3d63c..e88b7d83ad2f8 100644 --- a/.gitignore +++ b/.gitignore @@ -56,3 +56,5 @@ qnt-*.txt perf-*.txt examples/jeopardy/results.txt + +CMakeLists.txt.user* diff --git a/CMakeLists.txt.user 
b/CMakeLists.txt.user deleted file mode 100644 index e7d3738278eb2..0000000000000 --- a/CMakeLists.txt.user +++ /dev/null @@ -1,454 +0,0 @@ - - - - - - EnvironmentId - {f3929b0b-3d39-4fa3-8d2d-2b329b63b30c} - - - ProjectExplorer.Project.ActiveTarget - 0 - - - ProjectExplorer.Project.EditorSettings - - true - false - true - - Cpp - - CppGlobal - - - - QmlJS - - QmlJSGlobal - - - 2 - UTF-8 - false - 4 - false - 80 - true - true - 1 - false - true - false - 0 - true - true - 0 - 8 - true - false - 1 - true - true - true - *.md, *.MD, Makefile - false - true - true - - - - ProjectExplorer.Project.PluginSettings - - - true - false - true - true - true - true - - - 0 - true - - true - true - Builtin.DefaultTidyAndClazy - 6 - - - - true - - - true - - - - - ProjectExplorer.Project.Target.0 - - Desktop - Clang - Clang - {913660d6-ca1c-4b66-a4da-64108a3258a2} - 0 - 0 - 0 - - Release - false - - -DCMAKE_GENERATOR:STRING=Unix Makefiles --DCMAKE_BUILD_TYPE:STRING=Release --DQT_QMAKE_EXECUTABLE:STRING=%{Qt:qmakeExecutable} --DCMAKE_PREFIX_PATH:STRING=%{Qt:QT_INSTALL_PREFIX} --DCMAKE_C_COMPILER:STRING=%{Compiler:Executable:C} --DCMAKE_CXX_COMPILER:STRING=%{Compiler:Executable:Cxx} --DCMAKE_CXX_FLAGS_INIT:STRING=%{Qt:QML_DEBUG_FLAG} - /mnt/hhdd/Programme/OSS/llama.cpp/../build-llama.cpp-Clang-Release - - - - - all - - false - - true - Build - CMakeProjectManager.MakeStep - - 1 - Build - Build - ProjectExplorer.BuildSteps.Build - - - - - - clean - - false - - true - Build - CMakeProjectManager.MakeStep - - 1 - Clean - Clean - ProjectExplorer.BuildSteps.Clean - - 2 - false - - false - - Release - CMakeProjectManager.CMakeBuildConfiguration - - 1 - - - 0 - Deploy - Deploy - ProjectExplorer.BuildSteps.Deploy - - 1 - - false - ProjectExplorer.DefaultDeployConfiguration - - 1 - - true - true - true - - 2 - - baby-llama - CMakeProjectManager.CMakeRunConfiguration.baby-llama - baby-llama - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - 
- true - true - true - - 2 - - benchmark - CMakeProjectManager.CMakeRunConfiguration.benchmark - benchmark - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - test-quantize-perf - CMakeProjectManager.CMakeRunConfiguration.test-quantize-perf - test-quantize-perf - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - test-sampling - CMakeProjectManager.CMakeRunConfiguration.test-sampling - test-sampling - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - test-tokenizer-0 - CMakeProjectManager.CMakeRunConfiguration.test-tokenizer-0 - test-tokenizer-0 - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - train-text-from-scratch - CMakeProjectManager.CMakeRunConfiguration.train-text-from-scratch - train-text-from-scratch - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - vdot - CMakeProjectManager.CMakeRunConfiguration.vdot - vdot - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - simple - CMakeProjectManager.CMakeRunConfiguration.simple - simple - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - embedding - CMakeProjectManager.CMakeRunConfiguration.embedding - embedding - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - main - CMakeProjectManager.CMakeRunConfiguration.main - main - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - perplexity - 
CMakeProjectManager.CMakeRunConfiguration.perplexity - perplexity - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - q8dot - CMakeProjectManager.CMakeRunConfiguration.q8dot - q8dot - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - quantize - CMakeProjectManager.CMakeRunConfiguration.quantize - quantize - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - quantize-stats - CMakeProjectManager.CMakeRunConfiguration.quantize-stats - quantize-stats - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - save-load-state - CMakeProjectManager.CMakeRunConfiguration.save-load-state - save-load-state - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - - true - true - true - - 2 - - test-quantize-fns - CMakeProjectManager.CMakeRunConfiguration.test-quantize-fns - test-quantize-fns - false - true - true - false - true - /mnt/hhdd/Programme/OSS/build-llama.cpp-Clang-Release/bin - - 16 - - - - ProjectExplorer.Project.TargetCount - 1 - - - ProjectExplorer.Project.Updater.FileVersion - 22 - - - Version - 22 - - From 3b3d30e4ade98185b47dd781a7c7b2e82b0353a7 Mon Sep 17 00:00:00 2001 From: niansa Date: Thu, 22 Jun 2023 13:55:25 +0200 Subject: [PATCH 03/43] Cleanups --- ggml-vulkan.cpp | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 45f426a2fadd0..706a0ffeedd6c 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -9,6 +9,10 @@ #include #include +#ifndef __STDC_IEC_559__ +#error Your C implementation is not IEC 559 compliant, which is required for proper Vulkan interop. 
+#endif + typedef ggml_fp16_t half; #define MULTILINE_QUOTE(...) #__VA_ARGS__ @@ -53,25 +57,20 @@ layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; }; layout(binding = 1) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; }; layout(binding = 2) buffer tensorY { float y[]; }; -// Push constants -layout(push_constant) uniform PushConstants { - int k; -} pcs; - void main() { const int qk = QK4_0; const int i = int(gl_GlobalInvocationID.x); const int j = int(gl_GlobalInvocationID.y); - const float16_t d = x_d[i]; - const uint8_t qs = x_qs[i * (QK4_0 / 2) + j]; + const float d = float(x_d[i]); + const uint8_t qs = x_qs[i * (qk / 2) + j]; const int x0 = (qs & 0x0F) - 8; const int x1 = (qs >> 4) - 8; - y[i*qk + j + 0 ] = float16_t(x0)*d; - y[i*qk + j + qk/2] = float16_t(x1)*d; + y[i*qk + j + 0 ] = float(x0)*d; + y[i*qk + j + qk/2] = float(x1)*d; } ); @@ -97,20 +96,20 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { const auto x = reinterpret_cast(x_); + assert(k % qk == 0); + auto getVecBlockQ4_0D = [] (const block_q4_0 *x) { - std::vector fres; - fres.reserve(nb); + std::vector fres(nb); for (unsigned it = 0; it != nb; it++) { - fres.push_back(x[it].d); + fres[it] = x[it].d; } return fres; }; auto getVecBlockQ4_0QS = [] (const block_q4_0 *x) { - std::vector fres; - fres.resize(nb*(qk/2)); + std::vector fres(nb*(qk/2)); for (unsigned x_it = 0; x_it != nb; x_it++) { for (unsigned qs_it = 0; qs_it != qk / 2; qs_it++) { - fres.push_back(x[x_it].qs[qs_it]); + fres[x_it * (qk / 2) + qs_it] = x[x_it].qs[qs_it]; } } return fres; @@ -120,15 +119,9 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { const auto tensorBlockQ4_0QS = mgr.tensorT(getVecBlockQ4_0QS(x)); const auto tensorY = mgr.tensor(std::vector(y, y+y_size)); - struct PushConsts { - int k; - } pushConsts { - k - }; - mgr.sequence() ->record({tensorBlockQ4_0D, tensorBlockQ4_0QS, tensorY}) - ->record(mgr.algorithm({tensorBlockQ4_0D, tensorBlockQ4_0QS, tensorY}, spirv, {nb, 
qk/2, 0}, {}, {0}), std::vector{pushConsts}) + ->record(mgr.algorithm({tensorBlockQ4_0D, tensorBlockQ4_0QS, tensorY}, spirv, {nb, qk/2, 0})) ->record({tensorY}) ->eval(); From b0f11fa9c181e90c9294f83c16004874db682329 Mon Sep 17 00:00:00 2001 From: niansa Date: Thu, 22 Jun 2023 16:05:56 +0200 Subject: [PATCH 04/43] More code cleanups --- ggml-vulkan.cpp | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 706a0ffeedd6c..b0a84942e91ff 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #ifndef __STDC_IEC_559__ @@ -39,6 +40,20 @@ kp::Manager mgr; +std::vector compileSource(const std::string& source) { + //FIXME: Terrible solution!!!! + std::ofstream fileOut("tmp_kp_shader.comp"); + fileOut << source; + fileOut.close(); + if (system(std::string("glslangValidator -V tmp_kp_shader.comp -o tmp_kp_shader.comp.spv").c_str())) + throw std::runtime_error("Error running glslangValidator command"); + std::ifstream fileStream("tmp_kp_shader.comp.spv", std::ios::binary); + std::vector buffer; + buffer.insert(buffer.begin(), std::istreambuf_iterator(fileStream), {}); + return {(uint32_t*)buffer.data(), (uint32_t*)(buffer.data() + buffer.size())}; +} + + static const std::string program_source_head = R"( #version 450 #extension GL_EXT_shader_explicit_arithmetic_types_float16: enable @@ -50,9 +65,8 @@ layout (local_size_x = 1) in; )"; -static const std::string kernel_dequantize_row_q4_0 = +static const std::string program_dequantize_row_q4_0 = program_source_head+'\n'+MULTILINE_QUOTE( -// Tensors layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; }; layout(binding = 1) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; }; layout(binding = 2) buffer tensorY { float y[]; }; @@ -75,37 +89,24 @@ void main() { ); -std::vector compileSource(const std::string& source) { - //FIXME: Terrible solution!!!! 
- std::ofstream fileOut("tmp_kp_shader.comp"); - fileOut << source; - fileOut.close(); - if (system(std::string("glslangValidator -V tmp_kp_shader.comp -o tmp_kp_shader.comp.spv").c_str())) - throw std::runtime_error("Error running glslangValidator command"); - std::ifstream fileStream("tmp_kp_shader.comp.spv", std::ios::binary); - std::vector buffer; - buffer.insert(buffer.begin(), std::istreambuf_iterator(fileStream), {}); - return {(uint32_t*)buffer.data(), (uint32_t*)(buffer.data() + buffer.size())}; -} - void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { static const int qk = QK4_0; - static const unsigned nb = k / qk; - static const unsigned y_size = qk*2*nb; - const static auto spirv = compileSource(kernel_dequantize_row_q4_0); + const unsigned nb = k / qk; + const unsigned y_size = nb*qk; + const static auto spirv = compileSource(program_dequantize_row_q4_0); const auto x = reinterpret_cast(x_); assert(k % qk == 0); - auto getVecBlockQ4_0D = [] (const block_q4_0 *x) { + auto getVecBlockQ4_0D = [x, nb] () { std::vector fres(nb); for (unsigned it = 0; it != nb; it++) { fres[it] = x[it].d; } return fres; }; - auto getVecBlockQ4_0QS = [] (const block_q4_0 *x) { + auto getVecBlockQ4_0QS = [x, nb] () { std::vector fres(nb*(qk/2)); for (unsigned x_it = 0; x_it != nb; x_it++) { for (unsigned qs_it = 0; qs_it != qk / 2; qs_it++) { @@ -115,8 +116,8 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { return fres; }; - const auto tensorBlockQ4_0D = mgr.tensorT(getVecBlockQ4_0D(x)); - const auto tensorBlockQ4_0QS = mgr.tensorT(getVecBlockQ4_0QS(x)); + const auto tensorBlockQ4_0D = mgr.tensorT(getVecBlockQ4_0D()); + const auto tensorBlockQ4_0QS = mgr.tensorT(getVecBlockQ4_0QS()); const auto tensorY = mgr.tensor(std::vector(y, y+y_size)); mgr.sequence() @@ -125,7 +126,7 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { ->record({tensorY}) ->eval(); - std::memcpy(y, tensorY->data(), tensorY->size()); + std::memcpy(y, 
tensorY->data(), tensorY->size()*sizeof(*y)); } From 9cdaea9240c8ea21f4eed8ab7f7248ac19844022 Mon Sep 17 00:00:00 2001 From: niansa Date: Thu, 22 Jun 2023 16:30:36 +0200 Subject: [PATCH 05/43] Implemented dequantize_row_q4_1 --- ggml-vulkan.cpp | 101 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 81 insertions(+), 20 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index b0a84942e91ff..c722609a906c0 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -53,6 +53,35 @@ std::vector compileSource(const std::string& source) { return {(uint32_t*)buffer.data(), (uint32_t*)(buffer.data() + buffer.size())}; } +template +std::vector getVecBlockQ4_0D(T *x, unsigned nb) { + std::vector fres(nb); + for (unsigned it = 0; it != nb; it++) { + fres[it] = x[it].d; + } + return fres; +} + +template +std::vector getVecBlockQ4_0M(T *x, unsigned nb) { + std::vector fres(nb); + for (unsigned it = 0; it != nb; it++) { + fres[it] = x[it].m; + } + return fres; +} + +template +std::vector getVecBlockQ4_0QS(T *x, unsigned nb, unsigned qk) { + std::vector fres(nb*(qk/2)); + for (unsigned x_it = 0; x_it != nb; x_it++) { + for (unsigned qs_it = 0; qs_it != qk / 2; qs_it++) { + fres[x_it * (qk / 2) + qs_it] = x[x_it].qs[qs_it]; + } + } + return fres; +}; + static const std::string program_source_head = R"( #version 450 @@ -88,7 +117,6 @@ void main() { } ); - void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { static const int qk = QK4_0; const unsigned nb = k / qk; @@ -99,25 +127,8 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { assert(k % qk == 0); - auto getVecBlockQ4_0D = [x, nb] () { - std::vector fres(nb); - for (unsigned it = 0; it != nb; it++) { - fres[it] = x[it].d; - } - return fres; - }; - auto getVecBlockQ4_0QS = [x, nb] () { - std::vector fres(nb*(qk/2)); - for (unsigned x_it = 0; x_it != nb; x_it++) { - for (unsigned qs_it = 0; qs_it != qk / 2; qs_it++) { - fres[x_it * (qk / 2) + qs_it] = x[x_it].qs[qs_it]; - } - } 
- return fres; - }; - - const auto tensorBlockQ4_0D = mgr.tensorT(getVecBlockQ4_0D()); - const auto tensorBlockQ4_0QS = mgr.tensorT(getVecBlockQ4_0QS()); + const auto tensorBlockQ4_0D = mgr.tensorT(getVecBlockQ4_0D(x, nb)); + const auto tensorBlockQ4_0QS = mgr.tensorT(getVecBlockQ4_0QS(x, nb, qk)); const auto tensorY = mgr.tensor(std::vector(y, y+y_size)); mgr.sequence() @@ -130,6 +141,56 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { } +static const std::string program_dequantize_row_q4_1 = + program_source_head+'\n'+MULTILINE_QUOTE( +layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; }; +layout(binding = 1) buffer tensorBlockQ4_0M { float16_t x_m[]; }; +layout(binding = 2) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; }; +layout(binding = 3) buffer tensorY { float y[]; }; + +void main() { + const int qk = QK4_1; + + const int i = int(gl_GlobalInvocationID.x); + const int j = int(gl_GlobalInvocationID.y); + + const float d = float(x_d[i]); + const float m = float(x_m[i]); + const uint8_t qs = x_qs[i * (qk / 2) + j]; + + const int x0 = (qs & 0x0F); + const int x1 = (qs >> 4); + + y[i*qk + j + 0 ] = x0*d + m; + y[i*qk + j + qk/2] = x1*d + m; +} +); + +void ggml_vk_dequantize_row_q4_1(const void *x_, float *y, int k) { + static const int qk = QK4_1; + const unsigned nb = k / qk; + const unsigned y_size = nb*qk; + const static auto spirv = compileSource(program_dequantize_row_q4_1); + + const auto x = reinterpret_cast(x_); + + assert(k % qk == 0); + + const auto tensorBlockQ4_0D = mgr.tensorT(getVecBlockQ4_0D(x, nb)); + const auto tensorBlockQ4_0M = mgr.tensorT(getVecBlockQ4_0M(x, nb)); + const auto tensorBlockQ4_0QS = mgr.tensorT(getVecBlockQ4_0QS(x, nb, qk)); + const auto tensorY = mgr.tensor(std::vector(y, y+y_size)); + + mgr.sequence() + ->record({tensorBlockQ4_0D, tensorBlockQ4_0M, tensorBlockQ4_0QS, tensorY}) + ->record(mgr.algorithm({tensorBlockQ4_0D, tensorBlockQ4_0M, tensorBlockQ4_0QS, tensorY}, spirv, {nb, qk/2, 0})) + 
->record({tensorY}) + ->eval(); + + std::memcpy(y, tensorY->data(), tensorY->size()*sizeof(*y)); +} + + template<> kp::Tensor::TensorDataTypes kp::TensorT::dataType() From 339bc36cdda3014a80c45051ca89bf982e76f750 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 11:50:30 +0200 Subject: [PATCH 06/43] Added more functions from Metal --- ggml-vulkan.cpp | 142 ++++++++++++++++++++++++++++++++++++++++++++++-- ggml-vulkan.h | 26 ++++++++- llama.cpp | 60 ++++++++++++++++++++ 3 files changed, 222 insertions(+), 6 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index c722609a906c0..b7e70e221a04c 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -5,8 +5,10 @@ #include #include #include -#include +#include #include +#include +#include #include #include @@ -14,8 +16,6 @@ #error Your C implementation is not IEC 559 compliant, which is required for proper Vulkan interop. #endif -typedef ggml_fp16_t half; - #define MULTILINE_QUOTE(...) #__VA_ARGS__ #define STRINGIFY(x) STRINGIFY2(x) #define STRINGIFY2(x) #x @@ -24,6 +24,10 @@ typedef ggml_fp16_t half; #define QR4_0 2 #define QK4_1 32 + +typedef ggml_fp16_t half; +enum class byte : unsigned char {}; + typedef struct { half d; uint8_t qs[QK4_0 / 2]; @@ -35,12 +39,82 @@ typedef struct { uint8_t qs[QK4_1 / 2]; } block_q4_1; +struct ggml_kompute_context { + std::unordered_map> buffers; + std::unordered_map> tensors; +}; + kp::Manager mgr; +ggml_kompute_context *ggml_vk_init() { + return new ggml_kompute_context; +} + +void ggml_metal_free(struct ggml_kompute_context * ctx) { + delete ctx; +} + + +bool ggml_vk_add_buffer( + struct ggml_kompute_context * ctx, + const char * name, + void * data, + size_t size, + size_t max_size) { + try { + std::vector vec(max_size); + std::memcpy(vec.data(), data, std::max(size, max_size)); + auto tensor = mgr.tensorT(vec); + ctx->buffers.emplace(name, std::move(tensor)); + } catch (const std::exception & e) { + fprintf(stderr, "ggml_vk: failed to add buffer '%s': %s\n", name, 
e.what()); + return false; + } + return true; +} + +std::shared_ptr ggml_vk_get_buffer(struct ggml_kompute_context * ctx, const char * name) { + auto res = ctx->buffers.find(name); + if (res == ctx->buffers.end()) return nullptr; + return res->second; +} + + +void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { + if (t->backend != GGML_BACKEND_GPU) { + return; + } + + auto data = t->data; + auto size = ggml_nbytes(t); + + std::vector vec(size); + memcpy(vec.data(), data, size); + + auto tensor = mgr.tensorT(vec); + mgr.sequence()->eval({tensor}); + ctx->tensors.emplace(t, std::move(tensor)); +} + +void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { + if (t->backend != GGML_BACKEND_GPU) { + return; + } + + auto data = t->data; + auto size = ggml_nbytes(t); + + auto res = ctx->tensors.find(t); + + auto tensor = res->second; + mgr.sequence()->eval({tensor}); + memcpy(data, tensor->data(), size); +} + -std::vector compileSource(const std::string& source) { +static std::vector compileSource(const std::string& source) { //FIXME: Terrible solution!!!! 
std::ofstream fileOut("tmp_kp_shader.comp"); fileOut << source; @@ -53,6 +127,7 @@ std::vector compileSource(const std::string& source) { return {(uint32_t*)buffer.data(), (uint32_t*)(buffer.data() + buffer.size())}; } + template std::vector getVecBlockQ4_0D(T *x, unsigned nb) { std::vector fres(nb); @@ -90,12 +165,12 @@ static const std::string program_source_head = R"( #define QK4_0 32 #define QR4_0 2 #define QK4_1 32 -layout (local_size_x = 1) in; )"; static const std::string program_dequantize_row_q4_0 = program_source_head+'\n'+MULTILINE_QUOTE( +layout(local_size_x = 1, local_size_y = 1) in; layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; }; layout(binding = 1) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; }; layout(binding = 2) buffer tensorY { float y[]; }; @@ -143,6 +218,7 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { static const std::string program_dequantize_row_q4_1 = program_source_head+'\n'+MULTILINE_QUOTE( +layout(local_size_x = 1, local_size_y = 1) in; layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; }; layout(binding = 1) buffer tensorBlockQ4_0M { float16_t x_m[]; }; layout(binding = 2) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; }; @@ -191,6 +267,55 @@ void ggml_vk_dequantize_row_q4_1(const void *x_, float *y, int k) { } +static const std::string program_abmath = + program_source_head+'\n'+MULTILINE_QUOTE( +layout(push_constant) uniform PushConstants { + uint inAOff; + uint inBOff; + uint outOff; +} pcs; + + +layout(local_size_x = 1) in; +layout(binding = 0) buffer tensorInA { float inA[]; }; +layout(binding = 1) buffer tensorInB { float inB[]; }; +layout(binding = 2) buffer tensorout { float out[]; }; + + +void main() { + const int i = int(gl_GlobalInvocationID.x); + + out[pcs.outOff+i] = inA[pcs.inAOff+i] MATH_OP inB[pcs.inBOff+i]; +} +); + +template +void ggml_vk_abmath(const std::shared_ptr& inA, uint32_t inAOff, + const std::shared_ptr& inB, uint32_t inBOff, + std::shared_ptr& out, uint32_t 
outOff) { + const static auto spirv = compileSource("#define MATH_OP "+std::string(1, mathOP)+'\n'+program_abmath); + + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + } pushConsts { + inAOff, inBOff, outOff + }; + + mgr.sequence() + ->eval(mgr.algorithm({inA, inB, out}, spirv, {std::min(inA->size(), inB->size())}, {}, {pushConsts})); +} + +template +void ggml_vk_add(Args&&... args) { + return ggml_vk_abmath<'+'>(std::forward(args)...); +} + +template +void ggml_vk_mul(Args&&... args) { + return ggml_vk_abmath<'*'>(std::forward(args)...); +} + + template<> kp::Tensor::TensorDataTypes kp::TensorT::dataType() @@ -204,3 +329,10 @@ kp::TensorT::dataType() { return TensorDataTypes::eUnsignedInt; } + +template<> +kp::Tensor::TensorDataTypes +kp::TensorT::dataType() +{ + return TensorDataTypes::eUnsignedInt; +} diff --git a/ggml-vulkan.h b/ggml-vulkan.h index 34e6d46b3dbfa..649c34b537c28 100644 --- a/ggml-vulkan.h +++ b/ggml-vulkan.h @@ -1,12 +1,36 @@ #pragma once +#include + #ifdef __cplusplus extern "C" { #endif -void ggml_vk_init(void); +struct ggml_kompute_context; + + +ggml_kompute_context * ggml_vk_init(void); +void ggml_metal_free(struct ggml_kompute_context * ctx); + +// creates a mapping between a host memory buffer and a device memory buffer +// - make sure to map all buffers used in the graph before calling ggml_vk_graph_compute +// - the mapping is used during computation to determine the arguments of the compute kernels +// - you don't need to keep the host memory buffer allocated as it is never accessed by Vulkan +// - max_size specifies the maximum size of a tensor and is used to create shared views such +// that it is guaranteed that the tensor will fit in at least one of the views +// +bool ggml_vk_add_buffer( + struct ggml_kompute_context * ctx, + const char * name, + void * data, + size_t size, + size_t max_size); + +void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t); +void ggml_vk_get_tensor(struct 
ggml_kompute_context * ctx, struct ggml_tensor * t); void ggml_vk_dequantize_row_q4_0(const void * x, float * y, int k); +void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * cgraph); #ifdef __cplusplus } diff --git a/llama.cpp b/llama.cpp index e597f5048234b..824ed6121ce1d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -14,6 +14,8 @@ #include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) #include "ggml-opencl.h" +#elif defined(GGML_USE_KOMPUTE) +#include "ggml-vulkan.h" #endif #ifdef GGML_USE_METAL @@ -280,6 +282,8 @@ struct llama_context { #ifdef GGML_USE_METAL ggml_metal_context * ctx_metal = NULL; +#elif defined(GGML_USE_KOMPUTE) + ggml_kompute_context * ctx_kompute = NULL; #endif int buf_last = 0; @@ -1701,6 +1705,26 @@ static bool llama_eval_internal( ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v); } + ggml_graph_compute(ctx0, &gf); + } +#elif defined(GGML_USE_KOMPUTE) + if (lctx.ctx_kompute && N == 1) { + ggml_vk_graph_compute(lctx.ctx_kompute, &gf); + ggml_vk_get_tensor (lctx.ctx_kompute, cur); + } else { + // IMPORTANT: + // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla + // ggml_graph_compute(). + // + // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch. + // But for now, we have focused only on Matrix x Vector Metal multiplication. 
+ // + if (lctx.ctx_kompute) { + // We need to sync the GPU KV cache with the CPU KV cache + ggml_vk_get_tensor(lctx.ctx_kompute, kv_self.k); + ggml_vk_get_tensor(lctx.ctx_kompute, kv_self.v); + } + ggml_graph_compute(ctx0, &gf); } #else @@ -2743,6 +2767,42 @@ struct llama_context * llama_init_from_file( LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0)); #undef LLAMA_METAL_CHECK_BUF } +#elif defined(GGML_USE_KOMPUTE) + if (params.n_gpu_layers > 0) { + // this allocates all Metal resources and memory buffers + ctx->ctx_kompute = ggml_vk_init(); + + void * data_ptr = NULL; + size_t data_size = 0; + + if (params.use_mmap) { + data_ptr = ctx->model.mapping->addr; + data_size = ctx->model.mapping->size; + } else { + data_ptr = ggml_get_mem_buffer(ctx->model.ctx); + data_size = ggml_get_mem_size (ctx->model.ctx); + } + + const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); + + printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); + +#define LLAMA_METAL_CHECK_BUF(result) \ + if (!(result)) { \ + fprintf(stderr, "%s: failed to add buffer\n", __func__); \ + llama_free(ctx); \ + return NULL; \ + } + + LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "data", data_ptr, data_size, max_size)); + + LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0)); + LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0)); + + LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0)); + LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0)); +#undef LLAMA_METAL_CHECK_BUF + } #endif return ctx; From 9d643755a62075bad0570c54e82d87d4228d06ab Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 11:51:25 
+0200 Subject: [PATCH 07/43] Fixed compile error --- ggml-vulkan.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml-vulkan.h b/ggml-vulkan.h index 649c34b537c28..a3bc781d7799c 100644 --- a/ggml-vulkan.h +++ b/ggml-vulkan.h @@ -1,15 +1,16 @@ #pragma once -#include - #ifdef __cplusplus +#include extern "C" { +#else +#include #endif struct ggml_kompute_context; -ggml_kompute_context * ggml_vk_init(void); +struct ggml_kompute_context * ggml_vk_init(void); void ggml_metal_free(struct ggml_kompute_context * ctx); // creates a mapping between a host memory buffer and a device memory buffer From b8a4594f8930a53a099d91b0d77c7dd6242ee2af Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 12:19:33 +0200 Subject: [PATCH 08/43] More fixes... --- ggml-vulkan.cpp | 4 ++-- ggml-vulkan.h | 1 + ggml.c | 2 +- llama.cpp | 4 ++-- llama.h | 4 ++-- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index b7e70e221a04c..7879a59379b87 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -64,8 +64,8 @@ bool ggml_vk_add_buffer( size_t size, size_t max_size) { try { - std::vector vec(max_size); - std::memcpy(vec.data(), data, std::max(size, max_size)); + std::vector vec(std::max(size, max_size)); + std::memcpy(vec.data(), data, size); auto tensor = mgr.tensorT(vec); ctx->buffers.emplace(name, std::move(tensor)); } catch (const std::exception & e) { diff --git a/ggml-vulkan.h b/ggml-vulkan.h index a3bc781d7799c..b7f7371cb5ce2 100644 --- a/ggml-vulkan.h +++ b/ggml-vulkan.h @@ -31,6 +31,7 @@ void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t); void ggml_vk_dequantize_row_q4_0(const void * x, float * y, int k); +void ggml_vk_dequantize_row_q4_1(const void * x, float * y, int k); void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * cgraph); #ifdef __cplusplus diff --git 
a/ggml.c b/ggml.c index 151b9eefbf4b7..34f0870002a41 100644 --- a/ggml.c +++ b/ggml.c @@ -1558,7 +1558,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q4_1] = { - .dequantize_row_q = (dequantize_row_q_t)dequantize_row_q4_1, + .dequantize_row_q = (dequantize_row_q_t) ggml_vk_dequantize_row_q4_1, .quantize_row_q = quantize_row_q4_1, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, .quantize_row_q_dot = quantize_row_q8_1, diff --git a/llama.cpp b/llama.cpp index 824ed6121ce1d..85acd4e052ec2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1269,7 +1269,7 @@ static void llama_model_load_internal( } } #endif // GGML_USE_CUBLAS -#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_KOMPUTE) const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); @@ -1707,7 +1707,7 @@ static bool llama_eval_internal( ggml_graph_compute(ctx0, &gf); } -#elif defined(GGML_USE_KOMPUTE) +#elif defined(GGML_USE_KOMPUTE_TODO) if (lctx.ctx_kompute && N == 1) { ggml_vk_graph_compute(lctx.ctx_kompute, &gf); ggml_vk_get_tensor (lctx.ctx_kompute, cur); diff --git a/llama.h b/llama.h index 0de530d456932..446dd49b94de1 100644 --- a/llama.h +++ b/llama.h @@ -38,7 +38,7 @@ #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN #define LLAMA_SESSION_VERSION 1 -#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) +#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_KOMPUTE) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. 
#define LLAMA_SUPPORTS_GPU_OFFLOAD #endif @@ -71,7 +71,7 @@ extern "C" { typedef void (*llama_progress_callback)(float progress, void *ctx); - struct llama_context_params { + struct llama_context_params { int seed; // RNG seed, -1 for random int n_ctx; // text context int n_batch; // prompt processing batch size From d53924799631f93f9207c7be511cda5e75b33066 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 14:03:33 +0200 Subject: [PATCH 09/43] Began implementing ggml_graph_compute --- ggml-vulkan.cpp | 95 ++++++++++++++++++++++++++++++++++++++++++++++--- ggml-vulkan.h | 6 ++-- llama.cpp | 8 ++--- 3 files changed, 97 insertions(+), 12 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 7879a59379b87..d8cc9f1fa4ce3 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -75,6 +76,7 @@ bool ggml_vk_add_buffer( return true; } +static std::shared_ptr ggml_vk_get_buffer(struct ggml_kompute_context * ctx, const char * name) { auto res = ctx->buffers.find(name); if (res == ctx->buffers.end()) return nullptr; @@ -82,7 +84,7 @@ std::shared_ptr ggml_vk_get_buffer(struct ggml_kompute_context * ctx } -void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { +void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { if (t->backend != GGML_BACKEND_GPU) { return; } @@ -98,7 +100,7 @@ void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * ctx->tensors.emplace(t, std::move(tensor)); } -void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { +void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { if (t->backend != GGML_BACKEND_GPU) { return; } @@ -107,12 +109,23 @@ void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * auto size = ggml_nbytes(t); auto res = ctx->tensors.find(t); + assert(res != ctx->tensors.end()); auto 
tensor = res->second; mgr.sequence()->eval({tensor}); memcpy(data, tensor->data(), size); } +static +const std::shared_ptr & ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { + assert(t->backend != GGML_BACKEND_GPU); + + auto res = ctx->tensors.find(t); + assert(res != ctx->tensors.end()); + + return res->second; +} + static std::vector compileSource(const std::string& source) { //FIXME: Terrible solution!!!! @@ -302,17 +315,89 @@ void ggml_vk_abmath(const std::shared_ptr& inA, uint32_t inAOff, }; mgr.sequence() - ->eval(mgr.algorithm({inA, inB, out}, spirv, {std::min(inA->size(), inB->size())}, {}, {pushConsts})); + ->eval(mgr.algorithm({inA, inB, out}, spirv, {std::min(inA->size()-inAOff, inB->size()-inBOff)}, {}, {pushConsts})); } template void ggml_vk_add(Args&&... args) { - return ggml_vk_abmath<'+'>(std::forward(args)...); + return ggml_vk_abmath<'+'>(std::forward(args)...); } template void ggml_vk_mul(Args&&... args) { - return ggml_vk_abmath<'*'>(std::forward(args)...); + return ggml_vk_abmath<'*'>(std::forward(args)...); +} + + +void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) { + printf("%s: evaluating graph\n", __func__); + + const int n_seq = gf->n_threads; + + std::vector sequences(n_seq); + + std::vector threads(n_seq); + + for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) { + const int n_nodes_per_seq = (gf->n_nodes + n_seq - 1) / n_seq; + + threads[seq_idx] = std::thread([&, seq_idx, n_nodes_per_seq] () { + size_t offs_src0 = 0; + size_t offs_src1 = 0; + size_t offs_dst = 0; + + auto& seq = sequences[seq_idx]; + + const int node_start = (seq_idx + 0) * n_nodes_per_seq; + const int node_end = (seq_idx == n_seq - 1) ? 
gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq; + + for (int i = node_start; i < node_end; ++i) { + printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + + struct ggml_tensor * src0 = gf->nodes[i]->src0; + struct ggml_tensor * src1 = gf->nodes[i]->src1; + struct ggml_tensor * dst = gf->nodes[i]; + + const int64_t ne00 = src0 ? src0->ne[0] : 0; + const int64_t ne01 = src0 ? src0->ne[1] : 0; + const int64_t ne02 = src0 ? src0->ne[2] : 0; + const int64_t ne03 = src0 ? src0->ne[3] : 0; + + const uint64_t nb00 = src0 ? src0->nb[0] : 0; + const uint64_t nb01 = src0 ? src0->nb[1] : 0; + const uint64_t nb02 = src0 ? src0->nb[2] : 0; + const uint64_t nb03 = src0 ? src0->nb[3] : 0; + + const int64_t ne10 = src1 ? src1->ne[0] : 0; + const int64_t ne11 = src1 ? src1->ne[1] : 0; + const int64_t ne12 = src1 ? src1->ne[2] : 0; + const int64_t ne13 = src1 ? src1->ne[3] : 0; (void)ne13; + + const uint64_t nb10 = src1 ? src1->nb[0] : 0; + const uint64_t nb11 = src1 ? src1->nb[1] : 0; + const uint64_t nb12 = src1 ? src1->nb[2] : 0; + const uint64_t nb13 = src1 ? src1->nb[3] : 0; (void)nb13; + + const int64_t ne0 = dst ? dst->ne[0] : 0; + const int64_t ne1 = dst ? dst->ne[1] : 0; + const int64_t ne2 = dst ? dst->ne[2] : 0; + const int64_t ne3 = dst ? dst->ne[3] : 0; + + const uint64_t nb0 = dst ? dst->nb[0] : 0; + const uint64_t nb1 = dst ? dst->nb[1] : 0; + const uint64_t nb2 = dst ? dst->nb[2] : 0; + const uint64_t nb3 = dst ? dst->nb[3] : 0; + + const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; + const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; + const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; + + std::shared_ptr id_src0 = src0 ? ggml_vk_get_tensor(ctx, src0) : nullptr; + std::shared_ptr id_src1 = src1 ? ggml_vk_get_tensor(ctx, src1) : nullptr; + std::shared_ptr id_dst = dst ? 
ggml_vk_get_tensor(ctx, dst) : nullptr; + } + }); + } } diff --git a/ggml-vulkan.h b/ggml-vulkan.h index b7f7371cb5ce2..19aaec949d126 100644 --- a/ggml-vulkan.h +++ b/ggml-vulkan.h @@ -27,12 +27,12 @@ bool ggml_vk_add_buffer( size_t size, size_t max_size); -void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t); -void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t); +void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t); +void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t); void ggml_vk_dequantize_row_q4_0(const void * x, float * y, int k); void ggml_vk_dequantize_row_q4_1(const void * x, float * y, int k); -void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * cgraph); +void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf); #ifdef __cplusplus } diff --git a/llama.cpp b/llama.cpp index 85acd4e052ec2..89c7fa6560ca5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1707,10 +1707,10 @@ static bool llama_eval_internal( ggml_graph_compute(ctx0, &gf); } -#elif defined(GGML_USE_KOMPUTE_TODO) +#elif defined(GGML_USE_KOMPUTE) if (lctx.ctx_kompute && N == 1) { ggml_vk_graph_compute(lctx.ctx_kompute, &gf); - ggml_vk_get_tensor (lctx.ctx_kompute, cur); + ggml_vk_d2h_tensor (lctx.ctx_kompute, cur); } else { // IMPORTANT: // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla @@ -1721,8 +1721,8 @@ static bool llama_eval_internal( // if (lctx.ctx_kompute) { // We need to sync the GPU KV cache with the CPU KV cache - ggml_vk_get_tensor(lctx.ctx_kompute, kv_self.k); - ggml_vk_get_tensor(lctx.ctx_kompute, kv_self.v); + ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.k); + ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.v); } ggml_graph_compute(ctx0, &gf); From 18d6f7f8da06788788b4ec99c3dd7c90f52162e9 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 14:08:45 +0200 
Subject: [PATCH 10/43] More progress... --- ggml-vulkan.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index d8cc9f1fa4ce3..3e7fe30a68ce6 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -395,6 +395,20 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph std::shared_ptr id_src0 = src0 ? ggml_vk_get_tensor(ctx, src0) : nullptr; std::shared_ptr id_src1 = src1 ? ggml_vk_get_tensor(ctx, src1) : nullptr; std::shared_ptr id_dst = dst ? ggml_vk_get_tensor(ctx, dst) : nullptr; + + switch (dst->op) { + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_TRANSPOSE: + case GGML_OP_PERMUTE: + { + // noop + } break; + case GGML_OP_ADD: + { + ggml_vk_add(id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst); + } break; + } } }); } From b6264542b7bd3dc0461b08949c438771be47eca8 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 14:19:31 +0200 Subject: [PATCH 11/43] Added vk_mul to ggml_vk_graph_compute --- ggml-vulkan.cpp | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 3e7fe30a68ce6..57e1ebf6fe1dc 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -286,6 +286,7 @@ layout(push_constant) uniform PushConstants { uint inAOff; uint inBOff; uint outOff; + uint row; } pcs; @@ -298,20 +299,23 @@ layout(binding = 2) buffer tensorout { float out[]; }; void main() { const int i = int(gl_GlobalInvocationID.x); - out[pcs.outOff+i] = inA[pcs.inAOff+i] MATH_OP inB[pcs.inBOff+i]; + out[pcs.outOff+i] = inA[pcs.inAOff+i] MATH_OP inB[pcs.inBOff+(i ROW_OP)]; } ); template void ggml_vk_abmath(const std::shared_ptr& inA, uint32_t inAOff, const std::shared_ptr& inB, uint32_t inBOff, - std::shared_ptr& out, uint32_t outOff) { - const static auto spirv = compileSource("#define MATH_OP "+std::string(1, mathOP)+'\n'+program_abmath); + std::shared_ptr& out, uint32_t outOff, + uint32_t row = 0) { + const 
static auto spirv = compileSource("#define MATH_OP "+std::string(1, mathOP)+"\n" + "#define ROW_OP "+(row?"% pcs.row":"")+"\n" + +program_abmath); struct PushConstants { - uint32_t inAOff, inBOff, outOff; + uint32_t inAOff, inBOff, outOff, row; } pushConsts { - inAOff, inBOff, outOff + inAOff, inBOff, outOff, row }; mgr.sequence() @@ -334,7 +338,11 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph const int n_seq = gf->n_threads; - std::vector sequences(n_seq); + std::vector> sequences(n_seq); + + for (auto& sequence : sequences) { + sequence = mgr.sequence(); + } std::vector threads(n_seq); @@ -346,7 +354,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph size_t offs_src1 = 0; size_t offs_dst = 0; - auto& seq = sequences[seq_idx]; + auto& seq = *sequences[seq_idx]; const int node_start = (seq_idx + 0) * n_nodes_per_seq; const int node_end = (seq_idx == n_seq - 1) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq; @@ -408,6 +416,15 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { ggml_vk_add(id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst); } break; + case GGML_OP_MUL: + { + if (ggml_nelements(src1) == ne10) { + // src1 is a row + ggml_vk_mul(id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00); + } else { + ggml_vk_mul(id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst); + } + } break; } } }); From 5e9403342b953adce49cf585ca6a6574af4c6c61 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 15:01:09 +0200 Subject: [PATCH 12/43] Minor fixes --- ggml-vulkan.cpp | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 57e1ebf6fe1dc..af697b2214cb2 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -18,8 +18,6 @@ #endif #define MULTILINE_QUOTE(...) 
#__VA_ARGS__ -#define STRINGIFY(x) STRINGIFY2(x) -#define STRINGIFY2(x) #x #define QK4_0 32 #define QR4_0 2 @@ -182,7 +180,7 @@ static const std::string program_source_head = R"( static const std::string program_dequantize_row_q4_0 = - program_source_head+'\n'+MULTILINE_QUOTE( + MULTILINE_QUOTE( layout(local_size_x = 1, local_size_y = 1) in; layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; }; layout(binding = 1) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; }; @@ -209,7 +207,7 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { static const int qk = QK4_0; const unsigned nb = k / qk; const unsigned y_size = nb*qk; - const static auto spirv = compileSource(program_dequantize_row_q4_0); + const static auto spirv = compileSource(program_source_head+program_dequantize_row_q4_0); const auto x = reinterpret_cast(x_); @@ -230,7 +228,7 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { static const std::string program_dequantize_row_q4_1 = - program_source_head+'\n'+MULTILINE_QUOTE( + MULTILINE_QUOTE( layout(local_size_x = 1, local_size_y = 1) in; layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; }; layout(binding = 1) buffer tensorBlockQ4_0M { float16_t x_m[]; }; @@ -259,7 +257,7 @@ void ggml_vk_dequantize_row_q4_1(const void *x_, float *y, int k) { static const int qk = QK4_1; const unsigned nb = k / qk; const unsigned y_size = nb*qk; - const static auto spirv = compileSource(program_dequantize_row_q4_1); + const static auto spirv = compileSource(program_source_head+program_dequantize_row_q4_1); const auto x = reinterpret_cast(x_); @@ -281,7 +279,7 @@ void ggml_vk_dequantize_row_q4_1(const void *x_, float *y, int k) { static const std::string program_abmath = - program_source_head+'\n'+MULTILINE_QUOTE( + MULTILINE_QUOTE( layout(push_constant) uniform PushConstants { uint inAOff; uint inBOff; @@ -293,24 +291,25 @@ layout(push_constant) uniform PushConstants { layout(local_size_x = 1) in; layout(binding = 0) 
buffer tensorInA { float inA[]; }; layout(binding = 1) buffer tensorInB { float inB[]; }; -layout(binding = 2) buffer tensorout { float out[]; }; +layout(binding = 2) buffer tensorOut { float out_[]; }; void main() { const int i = int(gl_GlobalInvocationID.x); - out[pcs.outOff+i] = inA[pcs.inAOff+i] MATH_OP inB[pcs.inBOff+(i ROW_OP)]; + out_[pcs.outOff+i] = inA[pcs.inAOff+i] MATH_OP inB[pcs.inBOff+(i ROW_OP)]; } ); template void ggml_vk_abmath(const std::shared_ptr& inA, uint32_t inAOff, - const std::shared_ptr& inB, uint32_t inBOff, - std::shared_ptr& out, uint32_t outOff, - uint32_t row = 0) { - const static auto spirv = compileSource("#define MATH_OP "+std::string(1, mathOP)+"\n" - "#define ROW_OP "+(row?"% pcs.row":"")+"\n" - +program_abmath); + const std::shared_ptr& inB, uint32_t inBOff, + const std::shared_ptr& out, uint32_t outOff, + uint32_t row = 0) { + const static auto spirv = compileSource(program_source_head+ + "#define MATH_OP "+std::string(1, mathOP)+"\n" + "#define ROW_OP "+(row?"% pcs.row":"")+'\n'+ + program_abmath); struct PushConstants { uint32_t inAOff, inBOff, outOff, row; From e830264c9240bda2c2976b4f36a7f028f765f550 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 15:10:24 +0200 Subject: [PATCH 13/43] Share sequence to functions and add scale() --- ggml-vulkan.cpp | 50 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index af697b2214cb2..45502ab5a5838 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -287,13 +287,11 @@ layout(push_constant) uniform PushConstants { uint row; } pcs; - layout(local_size_x = 1) in; layout(binding = 0) buffer tensorInA { float inA[]; }; layout(binding = 1) buffer tensorInB { float inB[]; }; layout(binding = 2) buffer tensorOut { float out_[]; }; - void main() { const int i = int(gl_GlobalInvocationID.x); @@ -302,7 +300,8 @@ void main() { ); template -void ggml_vk_abmath(const std::shared_ptr& inA, 
uint32_t inAOff, +void ggml_vk_abmath(kp::Sequence& seq, + const std::shared_ptr& inA, uint32_t inAOff, const std::shared_ptr& inB, uint32_t inBOff, const std::shared_ptr& out, uint32_t outOff, uint32_t row = 0) { @@ -317,8 +316,7 @@ void ggml_vk_abmath(const std::shared_ptr& inA, uint32_t inAOff, inAOff, inBOff, outOff, row }; - mgr.sequence() - ->eval(mgr.algorithm({inA, inB, out}, spirv, {std::min(inA->size()-inAOff, inB->size()-inBOff)}, {}, {pushConsts})); + seq.record(mgr.algorithm({inA, inB, out}, spirv, {std::min(inA->size()-inAOff, inB->size()-inBOff)}, {}, {pushConsts})); } template @@ -332,6 +330,42 @@ void ggml_vk_mul(Args&&... args) { } +static const std::string program_scale = + MULTILINE_QUOTE( +layout(push_constant) uniform PushConstants { + uint inAOff; + uint inOff; + float scale; +} pcs; + +layout(local_size_x = 1) in; +layout(binding = 0) buffer tensorInA { float in_[]; }; +layout(binding = 1) buffer tensorOut { float out_[]; }; + +void main() { + const int i = int(gl_GlobalInvocationID.x); + + out_[pcs.outOff+i] = in_[pcs.inOff+i] * pcs.scale; +} +); + +void ggml_vk_scale(kp::Sequence& seq, + const std::shared_ptr& in, uint32_t inOff, + const std::shared_ptr& out, uint32_t outOff, + float scale) { + const static auto spirv = compileSource(program_source_head+program_scale); + + struct PushConstants { + uint32_t inOff, outOff; + float scale; + } pushConsts { + inOff, outOff, scale + }; + + seq.record(mgr.algorithm({in, out}, spirv, {in->size()-inOff}, {}, {pushConsts})); +} + + void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) { printf("%s: evaluating graph\n", __func__); @@ -413,15 +447,15 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph } break; case GGML_OP_ADD: { - ggml_vk_add(id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst); + ggml_vk_add(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst); } break; case GGML_OP_MUL: { if (ggml_nelements(src1) == 
ne10) { // src1 is a row - ggml_vk_mul(id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00); + ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00); } else { - ggml_vk_mul(id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst); + ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst); } } break; } From 5c0d8dd0f23170d62893e67a931c689b672d80a6 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 15:58:13 +0200 Subject: [PATCH 14/43] Specify program output size --- ggml-vulkan.cpp | 74 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 59 insertions(+), 15 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 45502ab5a5838..ed6e704f47df3 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -293,7 +293,7 @@ layout(binding = 1) buffer tensorInB { float inB[]; }; layout(binding = 2) buffer tensorOut { float out_[]; }; void main() { - const int i = int(gl_GlobalInvocationID.x); + const uint i = gl_GlobalInvocationID.x; out_[pcs.outOff+i] = inA[pcs.inAOff+i] MATH_OP inB[pcs.inBOff+(i ROW_OP)]; } @@ -304,7 +304,7 @@ void ggml_vk_abmath(kp::Sequence& seq, const std::shared_ptr& inA, uint32_t inAOff, const std::shared_ptr& inB, uint32_t inBOff, const std::shared_ptr& out, uint32_t outOff, - uint32_t row = 0) { + uint32_t size, uint32_t row = 0) { const static auto spirv = compileSource(program_source_head+ "#define MATH_OP "+std::string(1, mathOP)+"\n" "#define ROW_OP "+(row?"% pcs.row":"")+'\n'+ @@ -316,7 +316,7 @@ void ggml_vk_abmath(kp::Sequence& seq, inAOff, inBOff, outOff, row }; - seq.record(mgr.algorithm({inA, inB, out}, spirv, {std::min(inA->size()-inAOff, inB->size()-inBOff)}, {}, {pushConsts})); + seq.record(mgr.algorithm({inA, inB, out}, spirv, {size}, {}, {pushConsts})); } template @@ -343,7 +343,7 @@ layout(binding = 0) buffer tensorInA { float in_[]; }; layout(binding = 1) buffer tensorOut { float out_[]; }; void main() { - const int i = 
int(gl_GlobalInvocationID.x); + const uint i = gl_GlobalInvocationID.x; out_[pcs.outOff+i] = in_[pcs.inOff+i] * pcs.scale; } @@ -352,7 +352,7 @@ void main() { void ggml_vk_scale(kp::Sequence& seq, const std::shared_ptr& in, uint32_t inOff, const std::shared_ptr& out, uint32_t outOff, - float scale) { + uint32_t size, float scale) { const static auto spirv = compileSource(program_source_head+program_scale); struct PushConstants { @@ -362,7 +362,42 @@ void ggml_vk_scale(kp::Sequence& seq, inOff, outOff, scale }; - seq.record(mgr.algorithm({in, out}, spirv, {in->size()-inOff}, {}, {pushConsts})); + seq.record(mgr.algorithm({in, out}, spirv, {size}, {}, {pushConsts})); +} + + +static const std::string program_silu = + MULTILINE_QUOTE( +layout(push_constant) uniform PushConstants { + uint inAOff; + uint inOff; +} pcs; + +layout(local_size_x = 1) in; +layout(binding = 0) buffer tensorInA { float in_[]; }; +layout(binding = 1) buffer tensorOut { float out_[]; }; + +void main() { + const uint i = gl_GlobalInvocationID.x; + const float x = in_[pcs.inOff+i]; + + out_[pcs.outOff+i] = x / (1.0f + exp(-x)); +} +); + +void ggml_vk_silu(kp::Sequence& seq, + const std::shared_ptr& in, uint32_t inOff, + const std::shared_ptr& out, uint32_t outOff, + uint32_t size) { + const static auto spirv = compileSource(program_source_head+program_silu); + + struct PushConstants { + uint32_t inOff, outOff; + } pushConsts { + inOff, outOff + }; + + seq.record(mgr.algorithm({in, out}, spirv, {size}, {}, {pushConsts})); } @@ -447,17 +482,26 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph } break; case GGML_OP_ADD: { - ggml_vk_add(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst); + ggml_vk_add(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst)); } break; case GGML_OP_MUL: - { - if (ggml_nelements(src1) == ne10) { - // src1 is a row - ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00); - } 
else { - ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst); - } - } break; + { + if (ggml_nelements(src1) == ne10) { + // src1 is a row + ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ggml_nelements(dst)); + } else { + ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst)); + } + } break; + case GGML_OP_SCALE: + { + const float scale = *(const float *) src1->data; + ggml_vk_scale(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst), scale); + } break; + case GGML_OP_SILU: + { + ggml_vk_silu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); + } break; } } }); From 2589cb0c704189da7f8bf92fef276489f75cb548 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 16:02:49 +0200 Subject: [PATCH 15/43] Prevent compileSource race --- ggml-vulkan.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index ed6e704f47df3..5e1d206bdf76c 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -126,6 +127,8 @@ const std::shared_ptr & ggml_vk_get_tensor(struct ggml_kompute_conte static std::vector compileSource(const std::string& source) { + static std::mutex mutex; + std::lock_guard L(mutex); //FIXME: Terrible solution!!!! 
std::ofstream fileOut("tmp_kp_shader.comp"); fileOut << source; @@ -176,6 +179,8 @@ static const std::string program_source_head = R"( #define QK4_0 32 #define QR4_0 2 #define QK4_1 32 +#define GELU_COEF_A 0.044715; +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876; )"; From 09b0b3a49ba757ef44b5e8387eab3a9af4521c0d Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 16:13:32 +0200 Subject: [PATCH 16/43] Wait for all threads to finish --- ggml-vulkan.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 5e1d206bdf76c..c35509bd8b223 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -511,6 +511,12 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph } }); } + + // Wait for all threads to finish + for (auto& thread : threads) { + if (thread.joinable()) + thread.join(); + } } From 98e588c6eb54e7fe9726ad46c98d30bac5473a8b Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 16:50:37 +0200 Subject: [PATCH 17/43] Fix ggml_vk_h2d_tensor throwing on second call --- ggml-vulkan.cpp | 18 +++++++++++++----- llama.cpp | 16 ++++++++-------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index c35509bd8b223..c260c59c21402 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -91,12 +91,20 @@ void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * auto data = t->data; auto size = ggml_nbytes(t); - std::vector vec(size); - memcpy(vec.data(), data, size); + auto res = ctx->tensors.find(t); + + if (res != ctx->tensors.end()) { + assert(res->second->size() != size); + res->second->setRawData(data); + mgr.sequence()->eval({res->second}); + } else { + std::vector vec(size); + memcpy(vec.data(), data, size); - auto tensor = mgr.tensorT(vec); - mgr.sequence()->eval({tensor}); - ctx->tensors.emplace(t, std::move(tensor)); + auto tensor = mgr.tensorT(vec); + mgr.sequence()->eval({tensor}); + 
ctx->tensors.emplace(t, std::move(tensor)); + } } void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { diff --git a/llama.cpp b/llama.cpp index 89c7fa6560ca5..cbe285afb743a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2769,7 +2769,7 @@ struct llama_context * llama_init_from_file( } #elif defined(GGML_USE_KOMPUTE) if (params.n_gpu_layers > 0) { - // this allocates all Metal resources and memory buffers + // this allocates all Vulkan resources and memory buffers ctx->ctx_kompute = ggml_vk_init(); void * data_ptr = NULL; @@ -2787,21 +2787,21 @@ struct llama_context * llama_init_from_file( printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); -#define LLAMA_METAL_CHECK_BUF(result) \ +#define LLAMA_VK_CHECK_BUF(result) \ if (!(result)) { \ fprintf(stderr, "%s: failed to add buffer\n", __func__); \ llama_free(ctx); \ return NULL; \ } - LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "data", data_ptr, data_size, max_size)); + LLAMA_VK_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "data", data_ptr, data_size, max_size)); - LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0)); - LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0)); + LLAMA_VK_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0)); + LLAMA_VK_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0)); - LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0)); - LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0)); -#undef LLAMA_METAL_CHECK_BUF + LLAMA_VK_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0)); + 
LLAMA_VK_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0)); +#undef LLAMA_VK_CHECK_BUF } #endif From 46f577bfc1e29b4397e6958bc4400b326456c314 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 17:10:45 +0200 Subject: [PATCH 18/43] h2d tensors during loadup --- ggml-vulkan.cpp | 86 +++++++++++++++++++++++++++++++++++++++++++------ llama.cpp | 12 +++++-- 2 files changed, 86 insertions(+), 12 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index c260c59c21402..0f454c899cd94 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -378,6 +378,19 @@ void ggml_vk_scale(kp::Sequence& seq, seq.record(mgr.algorithm({in, out}, spirv, {size}, {}, {pushConsts})); } +void ggml_vk_xxlu(const std::vector& spirv, kp::Sequence& seq, + const std::shared_ptr& in, uint32_t inOff, + const std::shared_ptr& out, uint32_t outOff, + uint32_t size) { + struct PushConstants { + uint32_t inOff, outOff; + } pushConsts { + inOff, outOff + }; + + seq.record(mgr.algorithm({in, out}, spirv, {size}, {}, {pushConsts})); +} + static const std::string program_silu = MULTILINE_QUOTE( @@ -398,19 +411,64 @@ void main() { } ); -void ggml_vk_silu(kp::Sequence& seq, - const std::shared_ptr& in, uint32_t inOff, - const std::shared_ptr& out, uint32_t outOff, - uint32_t size) { +template +void ggml_vk_silu(Args&&... 
args) { const static auto spirv = compileSource(program_source_head+program_silu); - struct PushConstants { - uint32_t inOff, outOff; - } pushConsts { - inOff, outOff - }; + ggml_vk_xxlu(spirv, std::forward(args)...); +} - seq.record(mgr.algorithm({in, out}, spirv, {size}, {}, {pushConsts})); + +static const std::string program_relu = + MULTILINE_QUOTE( +layout(push_constant) uniform PushConstants { + uint inAOff; + uint inOff; +} pcs; + +layout(local_size_x = 1) in; +layout(binding = 0) buffer tensorInA { float in_[]; }; +layout(binding = 1) buffer tensorOut { float out_[]; }; + +void main() { + const uint i = gl_GlobalInvocationID.x; + + out_[pcs.outOff+i] = max(0.0, in_[pcs.inOff+i]); +} +); + +template +void ggml_vk_relu(Args&&... args) { + const static auto spirv = compileSource(program_source_head+program_relu); + + ggml_vk_xxlu(spirv, std::forward(args)...); +} + + +static const std::string program_gelu = + MULTILINE_QUOTE( +layout(push_constant) uniform PushConstants { + uint inAOff; + uint inOff; +} pcs; + +layout(local_size_x = 1) in; +layout(binding = 0) buffer tensorInA { float in_[]; }; +layout(binding = 1) buffer tensorOut { float out_[]; }; + +void main() { + const uint i = gl_GlobalInvocationID.x; + const float x = in_[pcs.inOff+i]; + + out_[pcs.outOff+i] = 0.5*x*(1.0 + tanh(SQRT_2_OVER_PI*x*(1.0 + GELU_COEF_A*x*x))); +} +); + +template +void ggml_vk_gelu(Args&&... 
args) { + const static auto spirv = compileSource(program_source_head+program_gelu); + + ggml_vk_xxlu(spirv, std::forward(args)...); } @@ -515,6 +573,14 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { ggml_vk_silu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); } break; + case GGML_OP_RELU: + { + ggml_vk_relu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); + } break; + case GGML_OP_GELU: + { + ggml_vk_gelu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); + } break; } } }); diff --git a/llama.cpp b/llama.cpp index cbe285afb743a..be4b5ca6872a8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -753,7 +753,7 @@ struct llama_model_loader { } } - void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { + void load_all_data(llama_context & lctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { size_t data_size = 0; size_t prefetch_size = 0; size_t lock_size = 0; @@ -810,6 +810,14 @@ struct llama_model_loader { free(lt.data); } break; +#elif defined(GGML_USE_KOMPUTE) + case GGML_BACKEND_GPU: + lt.ggml_tensor->data = lt.data; + ggml_vk_h2d_tensor(lctx.ctx_kompute, lt.ggml_tensor); + if (!use_mmap) { + free(lt.data); + } + break; #endif default: continue; @@ -1315,7 +1323,7 @@ static void llama_model_load_internal( } #endif - ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL); + ml->load_all_data(lctx, progress_callback, progress_callback_user_data, use_mlock ? 
&lctx.model.mlock_mmap : NULL); if (progress_callback) { progress_callback(1.0f, progress_callback_user_data); From 1a6819540856b8bf78958918f5b0279e080b99dc Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 17:46:09 +0200 Subject: [PATCH 19/43] Add mutexes for gpu tensors --- ggml-vulkan.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 0f454c899cd94..f8b24f706b4ca 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -42,6 +42,7 @@ typedef struct { struct ggml_kompute_context { std::unordered_map> buffers; std::unordered_map> tensors; + std::mutex tensors_mutex; }; @@ -63,6 +64,8 @@ bool ggml_vk_add_buffer( void * data, size_t size, size_t max_size) { + printf("%s: Context: %p Name: '%s'\n", __func__, ctx, name); + try { std::vector vec(std::max(size, max_size)); std::memcpy(vec.data(), data, size); @@ -77,6 +80,8 @@ bool ggml_vk_add_buffer( static std::shared_ptr ggml_vk_get_buffer(struct ggml_kompute_context * ctx, const char * name) { + printf("%s: Context: %p Name: '%s'\n", __func__, ctx, name); + auto res = ctx->buffers.find(name); if (res == ctx->buffers.end()) return nullptr; return res->second; @@ -84,6 +89,8 @@ std::shared_ptr ggml_vk_get_buffer(struct ggml_kompute_context * ctx void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { + printf("%s: Context: %p Tensor: %p\n", __func__, ctx, t); + if (t->backend != GGML_BACKEND_GPU) { return; } @@ -91,7 +98,9 @@ void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * auto data = t->data; auto size = ggml_nbytes(t); + ctx->tensors_mutex.lock(); auto res = ctx->tensors.find(t); + ctx->tensors_mutex.unlock(); if (res != ctx->tensors.end()) { assert(res->second->size() != size); @@ -103,11 +112,15 @@ void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * auto tensor = mgr.tensorT(vec); mgr.sequence()->eval({tensor}); + ctx->tensors_mutex.lock(); 
ctx->tensors.emplace(t, std::move(tensor)); + ctx->tensors_mutex.unlock(); } } void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { + printf("%s: Context: %p Tensor: %p\n", __func__, ctx, t); + if (t->backend != GGML_BACKEND_GPU) { return; } @@ -115,7 +128,9 @@ void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * auto data = t->data; auto size = ggml_nbytes(t); + ctx->tensors_mutex.lock(); auto res = ctx->tensors.find(t); + ctx->tensors_mutex.unlock(); assert(res != ctx->tensors.end()); auto tensor = res->second; @@ -125,9 +140,13 @@ void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * static const std::shared_ptr & ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { + printf("%s: Context: %p Tensor: %p\n", __func__, ctx, t); + assert(t->backend != GGML_BACKEND_GPU); + ctx->tensors_mutex.lock(); auto res = ctx->tensors.find(t); + ctx->tensors_mutex.unlock(); assert(res != ctx->tensors.end()); return res->second; From e6da9bd96b3444941421e71a0962976d9931a773 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 17:57:09 +0200 Subject: [PATCH 20/43] Added ggml_vk_mem_used() --- ggml-vulkan.cpp | 17 ++++++++++++++++- ggml-vulkan.h | 4 +++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index f8b24f706b4ca..12ed52fed6fc9 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -53,11 +54,25 @@ ggml_kompute_context *ggml_vk_init() { return new ggml_kompute_context; } -void ggml_metal_free(struct ggml_kompute_context * ctx) { +void ggml_vk_free(struct ggml_kompute_context * ctx) { delete ctx; } +size_t ggml_vk_mem_used(struct ggml_kompute_context * ctx) { + size_t fres = 0; + ctx->tensors_mutex.lock(); + for (const auto& tensor : ctx->tensors) { + fres += tensor.second->size(); + } + ctx->tensors_mutex.unlock(); + for (const 
auto& buffer : ctx->buffers) { + fres += buffer.second->size(); + } + return fres; +} + + bool ggml_vk_add_buffer( struct ggml_kompute_context * ctx, const char * name, diff --git a/ggml-vulkan.h b/ggml-vulkan.h index 19aaec949d126..5ec392782e0cd 100644 --- a/ggml-vulkan.h +++ b/ggml-vulkan.h @@ -11,7 +11,9 @@ struct ggml_kompute_context; struct ggml_kompute_context * ggml_vk_init(void); -void ggml_metal_free(struct ggml_kompute_context * ctx); +void ggml_vk_free(struct ggml_kompute_context * ctx); + +size_t ggml_vk_mem_used(struct ggml_kompute_context * ctx); // creates a mapping between a host memory buffer and a device memory buffer // - make sure to map all buffers used in the graph before calling ggml_vk_graph_compute From 40621ea0ec038bb0a360d1579999f6c8a3f73f88 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 18:26:21 +0200 Subject: [PATCH 21/43] Added more debugging --- ggml-vulkan.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 12ed52fed6fc9..9e422430929fe 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -118,7 +118,7 @@ void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * ctx->tensors_mutex.unlock(); if (res != ctx->tensors.end()) { - assert(res->second->size() != size); + GGML_ASSERT(res->second->size() != size); res->second->setRawData(data); mgr.sequence()->eval({res->second}); } else { @@ -146,7 +146,7 @@ void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * ctx->tensors_mutex.lock(); auto res = ctx->tensors.find(t); ctx->tensors_mutex.unlock(); - assert(res != ctx->tensors.end()); + GGML_ASSERT(res != ctx->tensors.end()); auto tensor = res->second; mgr.sequence()->eval({tensor}); @@ -157,12 +157,12 @@ static const std::shared_ptr & ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { printf("%s: Context: %p Tensor: %p\n", __func__, ctx, t); - assert(t->backend != 
GGML_BACKEND_GPU); + GGML_ASSERT(t->backend != GGML_BACKEND_GPU); ctx->tensors_mutex.lock(); auto res = ctx->tensors.find(t); ctx->tensors_mutex.unlock(); - assert(res != ctx->tensors.end()); + GGML_ASSERT(res != ctx->tensors.end()); return res->second; } @@ -258,7 +258,7 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { const auto x = reinterpret_cast(x_); - assert(k % qk == 0); + GGML_ASSERT(k % qk == 0); const auto tensorBlockQ4_0D = mgr.tensorT(getVecBlockQ4_0D(x, nb)); const auto tensorBlockQ4_0QS = mgr.tensorT(getVecBlockQ4_0QS(x, nb, qk)); @@ -308,7 +308,7 @@ void ggml_vk_dequantize_row_q4_1(const void *x_, float *y, int k) { const auto x = reinterpret_cast(x_); - assert(k % qk == 0); + GGML_ASSERT(k % qk == 0); const auto tensorBlockQ4_0D = mgr.tensorT(getVecBlockQ4_0D(x, nb)); const auto tensorBlockQ4_0M = mgr.tensorT(getVecBlockQ4_0M(x, nb)); @@ -615,8 +615,14 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { ggml_vk_gelu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); } break; + default: + fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + GGML_ASSERT(false); } } + + // Evaluate sequence + seq.eval(); }); } From 4b267e88b683f91e9a292ae4fc53f497835b9421 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 18:40:58 +0200 Subject: [PATCH 22/43] Temporarily care for all layers --- ggml-vulkan.cpp | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 9e422430929fe..2b2e3378c4727 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -106,10 +106,6 @@ std::shared_ptr ggml_vk_get_buffer(struct ggml_kompute_context * ctx void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { printf("%s: Context: %p Tensor: %p\n", __func__, ctx, t); - if (t->backend != GGML_BACKEND_GPU) { - return; - } - auto data = t->data; auto size = ggml_nbytes(t); 
@@ -121,6 +117,7 @@ void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * GGML_ASSERT(res->second->size() != size); res->second->setRawData(data); mgr.sequence()->eval({res->second}); + printf("%s: Updating Host->GPU tensor: %p\n", __func__, t); } else { std::vector vec(size); memcpy(vec.data(), data, size); @@ -130,16 +127,13 @@ void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * ctx->tensors_mutex.lock(); ctx->tensors.emplace(t, std::move(tensor)); ctx->tensors_mutex.unlock(); + printf("%s: Creating Host->GPU tensor: %p\n", __func__, t); } } void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { printf("%s: Context: %p Tensor: %p\n", __func__, ctx, t); - if (t->backend != GGML_BACKEND_GPU) { - return; - } - auto data = t->data; auto size = ggml_nbytes(t); @@ -151,18 +145,21 @@ void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * auto tensor = res->second; mgr.sequence()->eval({tensor}); memcpy(data, tensor->data(), size); + printf("%s: Updating GPU->Host tensor: %p\n", __func__, t); } static const std::shared_ptr & ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { printf("%s: Context: %p Tensor: %p\n", __func__, ctx, t); - GGML_ASSERT(t->backend != GGML_BACKEND_GPU); - ctx->tensors_mutex.lock(); auto res = ctx->tensors.find(t); ctx->tensors_mutex.unlock(); - GGML_ASSERT(res != ctx->tensors.end()); + + if (res == ctx->tensors.end()) { + ggml_vk_h2d_tensor(ctx, t); + return ggml_vk_get_tensor(ctx, t); + } return res->second; } From 55815b67f41998ce32e152569f730286480a8068 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 19:58:41 +0200 Subject: [PATCH 23/43] Improved memory safety --- ggml-vulkan.cpp | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 2b2e3378c4727..5007629103af1 100644 --- a/ggml-vulkan.cpp +++ 
b/ggml-vulkan.cpp @@ -94,26 +94,26 @@ bool ggml_vk_add_buffer( } static -std::shared_ptr ggml_vk_get_buffer(struct ggml_kompute_context * ctx, const char * name) { +kp::Tensor* ggml_vk_get_buffer(struct ggml_kompute_context * ctx, const char * name) { printf("%s: Context: %p Name: '%s'\n", __func__, ctx, name); - auto res = ctx->buffers.find(name); + const auto res = ctx->buffers.find(name); if (res == ctx->buffers.end()) return nullptr; - return res->second; + return res->second.get(); } void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { printf("%s: Context: %p Tensor: %p\n", __func__, ctx, t); - auto data = t->data; - auto size = ggml_nbytes(t); + const auto data = t->data; + const auto size = ggml_nbytes(t); ctx->tensors_mutex.lock(); - auto res = ctx->tensors.find(t); - ctx->tensors_mutex.unlock(); + const auto res = ctx->tensors.find(t); if (res != ctx->tensors.end()) { + ctx->tensors_mutex.unlock(); GGML_ASSERT(res->second->size() != size); res->second->setRawData(data); mgr.sequence()->eval({res->second}); @@ -124,7 +124,6 @@ void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * auto tensor = mgr.tensorT(vec); mgr.sequence()->eval({tensor}); - ctx->tensors_mutex.lock(); ctx->tensors.emplace(t, std::move(tensor)); ctx->tensors_mutex.unlock(); printf("%s: Creating Host->GPU tensor: %p\n", __func__, t); @@ -134,15 +133,15 @@ void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { printf("%s: Context: %p Tensor: %p\n", __func__, ctx, t); - auto data = t->data; - auto size = ggml_nbytes(t); + const auto data = t->data; + const auto size = ggml_nbytes(t); ctx->tensors_mutex.lock(); - auto res = ctx->tensors.find(t); + const auto res = ctx->tensors.find(t); ctx->tensors_mutex.unlock(); GGML_ASSERT(res != ctx->tensors.end()); - auto tensor = res->second; + auto& tensor = res->second; 
mgr.sequence()->eval({tensor}); memcpy(data, tensor->data(), size); printf("%s: Updating GPU->Host tensor: %p\n", __func__, t); @@ -153,10 +152,11 @@ const std::shared_ptr & ggml_vk_get_tensor(struct ggml_kompute_conte printf("%s: Context: %p Tensor: %p\n", __func__, ctx, t); ctx->tensors_mutex.lock(); - auto res = ctx->tensors.find(t); + const auto res = ctx->tensors.find(t); + const auto end = ctx->tensors.end(); ctx->tensors_mutex.unlock(); - if (res == ctx->tensors.end()) { + if (res == end) { ggml_vk_h2d_tensor(ctx, t); return ggml_vk_get_tensor(ctx, t); } @@ -356,7 +356,7 @@ void ggml_vk_abmath(kp::Sequence& seq, struct PushConstants { uint32_t inAOff, inBOff, outOff, row; - } pushConsts { + } const pushConsts { inAOff, inBOff, outOff, row }; @@ -370,6 +370,7 @@ void ggml_vk_add(Args&&... args) { template void ggml_vk_mul(Args&&... args) { + printf("%s: multiplying...\n", __func__); return ggml_vk_abmath<'*'>(std::forward(args)...); } @@ -377,13 +378,13 @@ void ggml_vk_mul(Args&&... 
args) { static const std::string program_scale = MULTILINE_QUOTE( layout(push_constant) uniform PushConstants { - uint inAOff; uint inOff; + uint outOff; float scale; } pcs; layout(local_size_x = 1) in; -layout(binding = 0) buffer tensorInA { float in_[]; }; +layout(binding = 0) buffer tensorIn { float in_[]; }; layout(binding = 1) buffer tensorOut { float out_[]; }; void main() { @@ -402,7 +403,7 @@ void ggml_vk_scale(kp::Sequence& seq, struct PushConstants { uint32_t inOff, outOff; float scale; - } pushConsts { + } const pushConsts { inOff, outOff, scale }; @@ -415,7 +416,7 @@ void ggml_vk_xxlu(const std::vector& spirv, kp::Sequence& seq, uint32_t size) { struct PushConstants { uint32_t inOff, outOff; - } pushConsts { + } const pushConsts { inOff, outOff }; @@ -426,8 +427,8 @@ void ggml_vk_xxlu(const std::vector& spirv, kp::Sequence& seq, static const std::string program_silu = MULTILINE_QUOTE( layout(push_constant) uniform PushConstants { - uint inAOff; uint inOff; + uint outOff; } pcs; layout(local_size_x = 1) in; @@ -614,7 +615,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph } break; default: fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - GGML_ASSERT(false); + //GGML_ASSERT(false); } } From e0814f86a25d1f6e56db9e369b688bd8167e941c Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 20:02:46 +0200 Subject: [PATCH 24/43] Free vk context --- llama.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llama.cpp b/llama.cpp index be4b5ca6872a8..740726445465a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2817,6 +2817,9 @@ struct llama_context * llama_init_from_file( } void llama_free(struct llama_context * ctx) { +#ifdef GGML_USE_KOMPUTE + ggml_vk_free(ctx->ctx_kompute); +#endif delete ctx; } From 5d5f66d1d914d520a3f40099881571024088a072 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 20:37:58 +0200 Subject: [PATCH 25/43] More little fixes and stuff --- 
ggml-vulkan.cpp | 231 ++++++++++++++++++++++++------------------------ 1 file changed, 114 insertions(+), 117 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 5007629103af1..4d4f31e77a6ab 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -165,14 +165,15 @@ const std::shared_ptr & ggml_vk_get_tensor(struct ggml_kompute_conte } -static std::vector compileSource(const std::string& source) { +static std::vector compileSource(const std::string& source, const char *debug_name) { + printf("%s: Compiling compute program: %s\n", __func__, debug_name); static std::mutex mutex; std::lock_guard L(mutex); //FIXME: Terrible solution!!!! std::ofstream fileOut("tmp_kp_shader.comp"); fileOut << source; fileOut.close(); - if (system(std::string("glslangValidator -V tmp_kp_shader.comp -o tmp_kp_shader.comp.spv").c_str())) + if (system(std::string("glslangValidator -V tmp_kp_shader.comp -o tmp_kp_shader.comp.spv > /dev/null").c_str())) throw std::runtime_error("Error running glslangValidator command"); std::ifstream fileStream("tmp_kp_shader.comp.spv", std::ios::binary); std::vector buffer; @@ -251,7 +252,7 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { static const int qk = QK4_0; const unsigned nb = k / qk; const unsigned y_size = nb*qk; - const static auto spirv = compileSource(program_source_head+program_dequantize_row_q4_0); + const static auto spirv = compileSource(program_source_head+program_dequantize_row_q4_0, __func__); const auto x = reinterpret_cast(x_); @@ -301,7 +302,7 @@ void ggml_vk_dequantize_row_q4_1(const void *x_, float *y, int k) { static const int qk = QK4_1; const unsigned nb = k / qk; const unsigned y_size = nb*qk; - const static auto spirv = compileSource(program_source_head+program_dequantize_row_q4_1); + const static auto spirv = compileSource(program_source_head+program_dequantize_row_q4_1, __func__); const auto x = reinterpret_cast(x_); @@ -352,7 +353,7 @@ void ggml_vk_abmath(kp::Sequence& seq, const static auto 
spirv = compileSource(program_source_head+ "#define MATH_OP "+std::string(1, mathOP)+"\n" "#define ROW_OP "+(row?"% pcs.row":"")+'\n'+ - program_abmath); + program_abmath, __func__); struct PushConstants { uint32_t inAOff, inBOff, outOff, row; @@ -370,7 +371,6 @@ void ggml_vk_add(Args&&... args) { template void ggml_vk_mul(Args&&... args) { - printf("%s: multiplying...\n", __func__); return ggml_vk_abmath<'*'>(std::forward(args)...); } @@ -398,7 +398,7 @@ void ggml_vk_scale(kp::Sequence& seq, const std::shared_ptr& in, uint32_t inOff, const std::shared_ptr& out, uint32_t outOff, uint32_t size, float scale) { - const static auto spirv = compileSource(program_source_head+program_scale); + const static auto spirv = compileSource(program_source_head+program_scale, __func__); struct PushConstants { uint32_t inOff, outOff; @@ -445,7 +445,7 @@ void main() { template void ggml_vk_silu(Args&&... args) { - const static auto spirv = compileSource(program_source_head+program_silu); + const static auto spirv = compileSource(program_source_head+program_silu, __func__); ggml_vk_xxlu(spirv, std::forward(args)...); } @@ -471,7 +471,7 @@ void main() { template void ggml_vk_relu(Args&&... args) { - const static auto spirv = compileSource(program_source_head+program_relu); + const static auto spirv = compileSource(program_source_head+program_relu, __func__); ggml_vk_xxlu(spirv, std::forward(args)...); } @@ -498,7 +498,7 @@ void main() { template void ggml_vk_gelu(Args&&... 
args) { - const static auto spirv = compileSource(program_source_head+program_gelu); + const static auto spirv = compileSource(program_source_head+program_gelu, __func__); ggml_vk_xxlu(spirv, std::forward(args)...); } @@ -514,120 +514,117 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph for (auto& sequence : sequences) { sequence = mgr.sequence(); } - - std::vector threads(n_seq); - for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) { const int n_nodes_per_seq = (gf->n_nodes + n_seq - 1) / n_seq; - threads[seq_idx] = std::thread([&, seq_idx, n_nodes_per_seq] () { - size_t offs_src0 = 0; - size_t offs_src1 = 0; - size_t offs_dst = 0; - - auto& seq = *sequences[seq_idx]; - - const int node_start = (seq_idx + 0) * n_nodes_per_seq; - const int node_end = (seq_idx == n_seq - 1) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq; - - for (int i = node_start; i < node_end; ++i) { - printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); - - struct ggml_tensor * src0 = gf->nodes[i]->src0; - struct ggml_tensor * src1 = gf->nodes[i]->src1; - struct ggml_tensor * dst = gf->nodes[i]; - - const int64_t ne00 = src0 ? src0->ne[0] : 0; - const int64_t ne01 = src0 ? src0->ne[1] : 0; - const int64_t ne02 = src0 ? src0->ne[2] : 0; - const int64_t ne03 = src0 ? src0->ne[3] : 0; - - const uint64_t nb00 = src0 ? src0->nb[0] : 0; - const uint64_t nb01 = src0 ? src0->nb[1] : 0; - const uint64_t nb02 = src0 ? src0->nb[2] : 0; - const uint64_t nb03 = src0 ? src0->nb[3] : 0; - - const int64_t ne10 = src1 ? src1->ne[0] : 0; - const int64_t ne11 = src1 ? src1->ne[1] : 0; - const int64_t ne12 = src1 ? src1->ne[2] : 0; - const int64_t ne13 = src1 ? src1->ne[3] : 0; (void)ne13; - - const uint64_t nb10 = src1 ? src1->nb[0] : 0; - const uint64_t nb11 = src1 ? src1->nb[1] : 0; - const uint64_t nb12 = src1 ? src1->nb[2] : 0; - const uint64_t nb13 = src1 ? src1->nb[3] : 0; (void)nb13; - - const int64_t ne0 = dst ? 
dst->ne[0] : 0; - const int64_t ne1 = dst ? dst->ne[1] : 0; - const int64_t ne2 = dst ? dst->ne[2] : 0; - const int64_t ne3 = dst ? dst->ne[3] : 0; - - const uint64_t nb0 = dst ? dst->nb[0] : 0; - const uint64_t nb1 = dst ? dst->nb[1] : 0; - const uint64_t nb2 = dst ? dst->nb[2] : 0; - const uint64_t nb3 = dst ? dst->nb[3] : 0; - - const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; - const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; - const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; - - std::shared_ptr id_src0 = src0 ? ggml_vk_get_tensor(ctx, src0) : nullptr; - std::shared_ptr id_src1 = src1 ? ggml_vk_get_tensor(ctx, src1) : nullptr; - std::shared_ptr id_dst = dst ? ggml_vk_get_tensor(ctx, dst) : nullptr; - - switch (dst->op) { - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_TRANSPOSE: - case GGML_OP_PERMUTE: - { - // noop - } break; - case GGML_OP_ADD: - { - ggml_vk_add(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst)); - } break; - case GGML_OP_MUL: - { - if (ggml_nelements(src1) == ne10) { - // src1 is a row - ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ggml_nelements(dst)); - } else { - ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst)); - } - } break; - case GGML_OP_SCALE: - { - const float scale = *(const float *) src1->data; - ggml_vk_scale(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst), scale); - } break; - case GGML_OP_SILU: - { - ggml_vk_silu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); - } break; - case GGML_OP_RELU: - { - ggml_vk_relu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); - } break; - case GGML_OP_GELU: - { - ggml_vk_gelu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); - } break; - default: - fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - 
//GGML_ASSERT(false); - } + size_t offs_src0 = 0; + size_t offs_src1 = 0; + size_t offs_dst = 0; + + auto& seq = *sequences[seq_idx]; + + const int node_start = (seq_idx + 0) * n_nodes_per_seq; + const int node_end = (seq_idx == n_seq - 1) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq; + + for (int i = node_start; i < node_end; ++i) { + printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + + struct ggml_tensor * src0 = gf->nodes[i]->src0; + struct ggml_tensor * src1 = gf->nodes[i]->src1; + struct ggml_tensor * dst = gf->nodes[i]; + + const int64_t ne00 = src0 ? src0->ne[0] : 0; + const int64_t ne01 = src0 ? src0->ne[1] : 0; + const int64_t ne02 = src0 ? src0->ne[2] : 0; + const int64_t ne03 = src0 ? src0->ne[3] : 0; + + const uint64_t nb00 = src0 ? src0->nb[0] : 0; + const uint64_t nb01 = src0 ? src0->nb[1] : 0; + const uint64_t nb02 = src0 ? src0->nb[2] : 0; + const uint64_t nb03 = src0 ? src0->nb[3] : 0; + + const int64_t ne10 = src1 ? src1->ne[0] : 0; + const int64_t ne11 = src1 ? src1->ne[1] : 0; + const int64_t ne12 = src1 ? src1->ne[2] : 0; + const int64_t ne13 = src1 ? src1->ne[3] : 0; (void)ne13; + + const uint64_t nb10 = src1 ? src1->nb[0] : 0; + const uint64_t nb11 = src1 ? src1->nb[1] : 0; + const uint64_t nb12 = src1 ? src1->nb[2] : 0; + const uint64_t nb13 = src1 ? src1->nb[3] : 0; (void)nb13; + + const int64_t ne0 = dst ? dst->ne[0] : 0; + const int64_t ne1 = dst ? dst->ne[1] : 0; + const int64_t ne2 = dst ? dst->ne[2] : 0; + const int64_t ne3 = dst ? dst->ne[3] : 0; + + const uint64_t nb0 = dst ? dst->nb[0] : 0; + const uint64_t nb1 = dst ? dst->nb[1] : 0; + const uint64_t nb2 = dst ? dst->nb[2] : 0; + const uint64_t nb3 = dst ? dst->nb[3] : 0; + + const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; + const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; + const enum ggml_type dstt = dst ? 
dst->type : GGML_TYPE_COUNT; + + + const static std::shared_ptr nullTensor = nullptr; + const std::shared_ptr& id_src0 = src0 ? ggml_vk_get_tensor(ctx, src0) : nullTensor; + const std::shared_ptr& id_src1 = src1 ? ggml_vk_get_tensor(ctx, src1) : nullTensor; + const std::shared_ptr& id_dst = dst ? ggml_vk_get_tensor(ctx, dst) : nullTensor; + + switch (dst->op) { + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_TRANSPOSE: + case GGML_OP_PERMUTE: + { + // noop + } break; + case GGML_OP_ADD: + { + ggml_vk_add(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst)); + } break; + case GGML_OP_MUL: + { + if (ggml_nelements(src1) == ne10) { + // src1 is a row + ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ggml_nelements(dst)); + } else { + ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst)); + } + } break; + case GGML_OP_SCALE: + { + const float scale = *(const float *) src1->data; + ggml_vk_scale(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst), scale); + } break; + case GGML_OP_SILU: + { + ggml_vk_silu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); + } break; + case GGML_OP_RELU: + { + ggml_vk_relu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); + } break; + case GGML_OP_GELU: + { + ggml_vk_gelu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); + } break; + //default: + //fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + //GGML_ASSERT(false); } + } - // Evaluate sequence - seq.eval(); - }); + // Evaluate sequence + seq.evalAsync(); } - // Wait for all threads to finish - for (auto& thread : threads) { - if (thread.joinable()) - thread.join(); + // Wait for all sequences to finish + for (auto& sequence : sequences) { + if (sequence->isRunning()) + sequence->evalAwait(); } } From acb7d90398980ffe8ea41785b43430672ca8e7f8 Mon Sep 17 00:00:00 2001 
From: niansa Date: Fri, 23 Jun 2023 20:39:32 +0200 Subject: [PATCH 26/43] Reenabled unknown op message --- ggml-vulkan.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 4d4f31e77a6ab..056dd9244477e 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -611,8 +611,8 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { ggml_vk_gelu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); } break; - //default: - //fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + default: + fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); //GGML_ASSERT(false); } } From 072007b1e8118164c10f126a3b9de8763646d034 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 23 Jun 2023 21:21:16 +0200 Subject: [PATCH 27/43] Add buffer qualifiers --- ggml-vulkan.cpp | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 056dd9244477e..b5b2dc5fcb5ef 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -227,9 +227,9 @@ static const std::string program_source_head = R"( static const std::string program_dequantize_row_q4_0 = MULTILINE_QUOTE( layout(local_size_x = 1, local_size_y = 1) in; -layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; }; -layout(binding = 1) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; }; -layout(binding = 2) buffer tensorY { float y[]; }; +layout(binding = 0) buffer restrict readonly tensorBlockQ4_0D { float16_t x_d[]; }; +layout(binding = 1) buffer restrict readonly tensorBlockQ4_0QS { uint8_t x_qs[]; }; +layout(binding = 2) buffer restrict writeonly tensorY { float y[]; }; void main() { const int qk = QK4_0; @@ -275,10 +275,10 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) { static const std::string program_dequantize_row_q4_1 = MULTILINE_QUOTE( layout(local_size_x 
= 1, local_size_y = 1) in; -layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; }; -layout(binding = 1) buffer tensorBlockQ4_0M { float16_t x_m[]; }; -layout(binding = 2) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; }; -layout(binding = 3) buffer tensorY { float y[]; }; +layout(binding = 0) buffer restrict readonly tensorBlockQ4_0D { float16_t x_d[]; }; +layout(binding = 1) buffer restrict readonly tensorBlockQ4_0M { float16_t x_m[]; }; +layout(binding = 2) buffer restrict readonly tensorBlockQ4_0QS { uint8_t x_qs[]; }; +layout(binding = 3) buffer restrict writeonly tensorY { float y[]; }; void main() { const int qk = QK4_1; @@ -333,9 +333,9 @@ layout(push_constant) uniform PushConstants { } pcs; layout(local_size_x = 1) in; -layout(binding = 0) buffer tensorInA { float inA[]; }; -layout(binding = 1) buffer tensorInB { float inB[]; }; -layout(binding = 2) buffer tensorOut { float out_[]; }; +layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; +layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; +layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; void main() { const uint i = gl_GlobalInvocationID.x; @@ -384,8 +384,8 @@ layout(push_constant) uniform PushConstants { } pcs; layout(local_size_x = 1) in; -layout(binding = 0) buffer tensorIn { float in_[]; }; -layout(binding = 1) buffer tensorOut { float out_[]; }; +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; void main() { const uint i = gl_GlobalInvocationID.x; @@ -432,8 +432,8 @@ layout(push_constant) uniform PushConstants { } pcs; layout(local_size_x = 1) in; -layout(binding = 0) buffer tensorInA { float in_[]; }; -layout(binding = 1) buffer tensorOut { float out_[]; }; +layout(binding = 0) buffer restrict readonly tensorInA { float in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; void main() { const uint i 
= gl_GlobalInvocationID.x; @@ -459,8 +459,8 @@ layout(push_constant) uniform PushConstants { } pcs; layout(local_size_x = 1) in; -layout(binding = 0) buffer tensorInA { float in_[]; }; -layout(binding = 1) buffer tensorOut { float out_[]; }; +layout(binding = 0) buffer restrict readonly tensorInA { float in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; void main() { const uint i = gl_GlobalInvocationID.x; @@ -485,8 +485,8 @@ layout(push_constant) uniform PushConstants { } pcs; layout(local_size_x = 1) in; -layout(binding = 0) buffer tensorInA { float in_[]; }; -layout(binding = 1) buffer tensorOut { float out_[]; }; +layout(binding = 0) buffer restrict readonly tensorInA { float in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; void main() { const uint i = gl_GlobalInvocationID.x; From ed14f0764ad94d5550016226e3c14c1c6e87ce35 Mon Sep 17 00:00:00 2001 From: niansa Date: Wed, 28 Jun 2023 10:15:23 +0200 Subject: [PATCH 28/43] Fixed ggml_vk_abmath row argument --- ggml-vulkan.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index b5b2dc5fcb5ef..15433d544bdfb 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -344,15 +344,17 @@ void main() { } ); -template +template void ggml_vk_abmath(kp::Sequence& seq, const std::shared_ptr& inA, uint32_t inAOff, const std::shared_ptr& inB, uint32_t inBOff, const std::shared_ptr& out, uint32_t outOff, uint32_t size, uint32_t row = 0) { + GGML_ASSERT(with_row?row:!row); + const static auto spirv = compileSource(program_source_head+ "#define MATH_OP "+std::string(1, mathOP)+"\n" - "#define ROW_OP "+(row?"% pcs.row":"")+'\n'+ + "#define ROW_OP "+(with_row?"% pcs.row":"")+'\n'+ program_abmath, __func__); struct PushConstants { @@ -369,9 +371,9 @@ void ggml_vk_add(Args&&... args) { return ggml_vk_abmath<'+'>(std::forward(args)...); } -template +template void ggml_vk_mul(Args&&... 
args) { - return ggml_vk_abmath<'*'>(std::forward(args)...); + return ggml_vk_abmath<'*', with_row>(std::forward(args)...); } @@ -589,7 +591,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { if (ggml_nelements(src1) == ne10) { // src1 is a row - ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ggml_nelements(dst)); + ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst), ne00); } else { ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst)); } From e2b721db654129f6d1a4c55dbd51bb503406104b Mon Sep 17 00:00:00 2001 From: niansa Date: Wed, 28 Jun 2023 10:19:18 +0200 Subject: [PATCH 29/43] Allow vk add row --- ggml-vulkan.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 15433d544bdfb..3c7beeddeea93 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -366,9 +366,9 @@ void ggml_vk_abmath(kp::Sequence& seq, seq.record(mgr.algorithm({inA, inB, out}, spirv, {size}, {}, {pushConsts})); } -template +template void ggml_vk_add(Args&&... 
args) { - return ggml_vk_abmath<'+'>(std::forward(args)...); + return ggml_vk_abmath<'+', with_row>(std::forward(args)...); } template From de7d1823ed7e6c9054e10368ebe34e0c666af7b2 Mon Sep 17 00:00:00 2001 From: niansa Date: Wed, 28 Jun 2023 12:48:41 +0200 Subject: [PATCH 30/43] Implemented ggml_vk_soft_max --- ggml-vulkan.cpp | 144 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 127 insertions(+), 17 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 3c7beeddeea93..1cc54d06f2636 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -212,15 +212,28 @@ std::vector getVecBlockQ4_0QS(T *x, unsigned nb, unsigned qk) { }; -static const std::string program_source_head = R"( -#version 450 +static const std::string program_source_head = R"(#version 450 + #extension GL_EXT_shader_explicit_arithmetic_types_float16: enable #extension GL_EXT_shader_explicit_arithmetic_types_int8: enable +#extension GL_EXT_shader_explicit_arithmetic_types_int64: enable + #define QK4_0 32 #define QR4_0 2 #define QK4_1 32 + #define GELU_COEF_A 0.044715; #define SQRT_2_OVER_PI 0.79788456080286535587989211986876; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif )"; @@ -366,16 +379,6 @@ void ggml_vk_abmath(kp::Sequence& seq, seq.record(mgr.algorithm({inA, inB, out}, spirv, {size}, {}, {pushConsts})); } -template -void ggml_vk_add(Args&&... args) { - return ggml_vk_abmath<'+', with_row>(std::forward(args)...); -} - -template -void ggml_vk_mul(Args&&... args) { - return ggml_vk_abmath<'*', with_row>(std::forward(args)...); -} - static const std::string program_scale = MULTILINE_QUOTE( @@ -456,8 +459,8 @@ void ggml_vk_silu(Args&&... args) { static const std::string program_relu = MULTILINE_QUOTE( layout(push_constant) uniform PushConstants { - uint inAOff; uint inOff; + uint outOff; } pcs; layout(local_size_x = 1) in; @@ -482,8 +485,8 @@ void ggml_vk_relu(Args&&... 
args) { static const std::string program_gelu = MULTILINE_QUOTE( layout(push_constant) uniform PushConstants { - uint inAOff; uint inOff; + uint outOff; } pcs; layout(local_size_x = 1) in; @@ -506,6 +509,109 @@ void ggml_vk_gelu(Args&&... args) { } +static const std::string program_soft_max = + MULTILINE_QUOTE( +layout(push_constant) uniform PushConstants { + uint64_t ne00; + uint64_t ne01; + uint64_t ne02; + uint inOff; + uint outOff; +} pcs; + +layout(local_size_x = nth) in; +layout(binding = 0) buffer restrict readonly tensorInA { float in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; + +shared float buf[nth]; + +void main() { + const uint64_t i03 = uint64_t(gl_GlobalInvocationID.z); + const uint64_t i02 = uint64_t(gl_GlobalInvocationID.y); + const uint64_t i01 = uint64_t(gl_GlobalInvocationID.x); + + const uint extra_off = uint(i03*pcs.ne02*pcs.ne01*pcs.ne00 + i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00); + const uint in_off = pcs.inOff + extra_off; + const uint out_off = pcs.outOff + extra_off; + + // parallel max + buf[gl_LocalInvocationID.x] = uintBitsToFloat(0xFF800000); + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], in_[in_off + i00]); + } + + // reduce + barrier(); + memoryBarrierShared(); + for (uint i = nth/2; i > 0; i /= 2) { + if (gl_LocalInvocationID.x < i) { + buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], buf[gl_LocalInvocationID.x + i]); + } + barrier(); + memoryBarrierShared(); + } + + // broadcast (no effect?) + if (gl_LocalInvocationID.x == 0) { + buf[0] = buf[0]; // ??? 
+ } + + barrier(); + memoryBarrierShared(); + + const float max_ = buf[0]; + + // parallel sum + buf[gl_LocalInvocationID.x] = 0.0; + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + buf[gl_LocalInvocationID.x] += exp(in_[in_off + i00] - max_); + } + + // reduce + barrier(); + memoryBarrierShared(); + for (uint i = nth/2; i > 0; i /= 2) { + if (gl_LocalInvocationID.x < i) { + buf[gl_LocalInvocationID.x] += buf[gl_LocalInvocationID.x + i]; + } + barrier(); + memoryBarrierShared(); + } + + // broadcast (no effect?) + if (gl_LocalInvocationID.x == 0) { + buf[0] = buf[0]; // ??? + } + + barrier(); + memoryBarrierShared(); + + const float sum = buf[0]; + + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + out_[out_off + i00] = exp(in_[in_off + i00] - max_) / sum; + } +} +); + +void ggml_vk_soft_max(kp::Sequence& seq, + const std::shared_ptr& in, uint32_t inOff, + const std::shared_ptr& out, uint32_t outOff, + int64_t ne00, int64_t ne01, int64_t ne02, uint64_t ne03) { + const static unsigned nth = 32; + const static auto spirv = compileSource(program_source_head+"#define nth "+std::to_string(nth)+"\n"+program_soft_max, __func__); + + struct PushConstants { + int64_t ne00, ne01, ne02; + uint32_t inOff, outOff; + } pushConsts { + ne00, ne01, ne02, inOff, outOff + }; + + seq.record(mgr.algorithm({in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts})); +} + + void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) { printf("%s: evaluating graph\n", __func__); @@ -585,15 +691,15 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph } break; case GGML_OP_ADD: { - ggml_vk_add(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst)); + ggml_vk_abmath<'+'>(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst)); } break; case GGML_OP_MUL: { if (ggml_nelements(src1) == ne10) { // src1 is a 
row - ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst), ne00); + ggml_vk_abmath<'*', true>(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst), ne00); } else { - ggml_vk_mul(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst)); + ggml_vk_abmath<'*'>(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ggml_nelements(dst)); } } break; case GGML_OP_SCALE: @@ -613,6 +719,10 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { ggml_vk_gelu(seq, id_src0, offs_src0, id_dst, offs_dst, ggml_nelements(dst)); } break; + case GGML_OP_SOFT_MAX: + { + ggml_vk_soft_max(seq, id_src0, offs_src0, id_dst, offs_dst, ne00, ne01, ne02, ne03); + } break; default: fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); //GGML_ASSERT(false); From 5ac68ccacb7b86037a402a8792ffa1fb102f4394 Mon Sep 17 00:00:00 2001 From: niansa Date: Thu, 29 Jun 2023 11:14:21 +0200 Subject: [PATCH 31/43] Cleanups --- ggml-vulkan.h | 2 -- ggml.c | 6 ++---- llama.cpp | 12 ++---------- 3 files changed, 4 insertions(+), 16 deletions(-) diff --git a/ggml-vulkan.h b/ggml-vulkan.h index 5ec392782e0cd..361d8b5e2c94e 100644 --- a/ggml-vulkan.h +++ b/ggml-vulkan.h @@ -32,8 +32,6 @@ bool ggml_vk_add_buffer( void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t); void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t); -void ggml_vk_dequantize_row_q4_0(const void * x, float * y, int k); -void ggml_vk_dequantize_row_q4_1(const void * x, float * y, int k); void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf); #ifdef __cplusplus diff --git a/ggml.c b/ggml.c index 34f0870002a41..416a205769414 100644 --- a/ggml.c +++ b/ggml.c @@ -161,8 +161,6 @@ inline static void* ggml_aligned_malloc(size_t size) { #endif #elif defined(GGML_USE_OPENBLAS) #include 
-#elif defined(GGML_USE_KOMPUTE) -#include "ggml-vulkan.h" #elif defined(GGML_USE_CUBLAS) #include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) @@ -1550,7 +1548,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = { - .dequantize_row_q = (dequantize_row_q_t) ggml_vk_dequantize_row_q4_0, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_0, .quantize_row_q = quantize_row_q4_0, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, .quantize_row_q_dot = quantize_row_q8_0, @@ -1558,7 +1556,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q4_1] = { - .dequantize_row_q = (dequantize_row_q_t) ggml_vk_dequantize_row_q4_1, + .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_1, .quantize_row_q = quantize_row_q4_1, .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, .quantize_row_q_dot = quantize_row_q8_1, diff --git a/llama.cpp b/llama.cpp index 740726445465a..40e3a4a7d11bd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -753,7 +753,7 @@ struct llama_model_loader { } } - void load_all_data(llama_context & lctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { + void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { size_t data_size = 0; size_t prefetch_size = 0; size_t lock_size = 0; @@ -810,14 +810,6 @@ struct llama_model_loader { free(lt.data); } break; -#elif defined(GGML_USE_KOMPUTE) - case GGML_BACKEND_GPU: - lt.ggml_tensor->data = lt.data; - ggml_vk_h2d_tensor(lctx.ctx_kompute, lt.ggml_tensor); - if (!use_mmap) { - free(lt.data); - } - break; #endif default: continue; @@ -1323,7 +1315,7 @@ static void llama_model_load_internal( } #endif - ml->load_all_data(lctx, progress_callback, progress_callback_user_data, 
use_mlock ? &lctx.model.mlock_mmap : NULL); + ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL); if (progress_callback) { progress_callback(1.0f, progress_callback_user_data); From 749d6179a84b2d09eab3367c4619480cf6f75ff3 Mon Sep 17 00:00:00 2001 From: niansa Date: Thu, 29 Jun 2023 14:23:00 +0200 Subject: [PATCH 32/43] Snake case all functions --- ggml-vulkan.cpp | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 1cc54d06f2636..70247a40d08ac 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -165,7 +165,7 @@ const std::shared_ptr & ggml_vk_get_tensor(struct ggml_kompute_conte } -static std::vector compileSource(const std::string& source, const char *debug_name) { +static std::vector glsl_compile_source(const std::string& source, const char *debug_name) { printf("%s: Compiling compute program: %s\n", __func__, debug_name); static std::mutex mutex; std::lock_guard L(mutex); @@ -183,7 +183,7 @@ static std::vector compileSource(const std::string& source, const char template -std::vector getVecBlockQ4_0D(T *x, unsigned nb) { +std::vector get_vec_block_Q4_0D(T *x, unsigned nb) { std::vector fres(nb); for (unsigned it = 0; it != nb; it++) { fres[it] = x[it].d; @@ -192,7 +192,7 @@ std::vector getVecBlockQ4_0D(T *x, unsigned nb) { } template -std::vector getVecBlockQ4_0M(T *x, unsigned nb) { +std::vector get_vec_block_Q4_0M(T *x, unsigned nb) { std::vector fres(nb); for (unsigned it = 0; it != nb; it++) { fres[it] = x[it].m; @@ -201,7 +201,7 @@ std::vector getVecBlockQ4_0M(T *x, unsigned nb) { } template -std::vector getVecBlockQ4_0QS(T *x, unsigned nb, unsigned qk) { +std::vector get_vec_block_Q4_0QS(T *x, unsigned nb, unsigned qk) { std::vector fres(nb*(qk/2)); for (unsigned x_it = 0; x_it != nb; x_it++) { for (unsigned qs_it = 0; qs_it != qk / 2; qs_it++) { @@ -265,14 +265,14 @@ void ggml_vk_dequantize_row_q4_0(const 
void *x_, float *y, int k) { static const int qk = QK4_0; const unsigned nb = k / qk; const unsigned y_size = nb*qk; - const static auto spirv = compileSource(program_source_head+program_dequantize_row_q4_0, __func__); + const static auto spirv = glsl_compile_source(program_source_head+program_dequantize_row_q4_0, __func__); const auto x = reinterpret_cast(x_); GGML_ASSERT(k % qk == 0); - const auto tensorBlockQ4_0D = mgr.tensorT(getVecBlockQ4_0D(x, nb)); - const auto tensorBlockQ4_0QS = mgr.tensorT(getVecBlockQ4_0QS(x, nb, qk)); + const auto tensorBlockQ4_0D = mgr.tensorT(get_vec_block_Q4_0D(x, nb)); + const auto tensorBlockQ4_0QS = mgr.tensorT(get_vec_block_Q4_0QS(x, nb, qk)); const auto tensorY = mgr.tensor(std::vector(y, y+y_size)); mgr.sequence() @@ -315,15 +315,15 @@ void ggml_vk_dequantize_row_q4_1(const void *x_, float *y, int k) { static const int qk = QK4_1; const unsigned nb = k / qk; const unsigned y_size = nb*qk; - const static auto spirv = compileSource(program_source_head+program_dequantize_row_q4_1, __func__); + const static auto spirv = glsl_compile_source(program_source_head+program_dequantize_row_q4_1, __func__); const auto x = reinterpret_cast(x_); GGML_ASSERT(k % qk == 0); - const auto tensorBlockQ4_0D = mgr.tensorT(getVecBlockQ4_0D(x, nb)); - const auto tensorBlockQ4_0M = mgr.tensorT(getVecBlockQ4_0M(x, nb)); - const auto tensorBlockQ4_0QS = mgr.tensorT(getVecBlockQ4_0QS(x, nb, qk)); + const auto tensorBlockQ4_0D = mgr.tensorT(get_vec_block_Q4_0D(x, nb)); + const auto tensorBlockQ4_0M = mgr.tensorT(get_vec_block_Q4_0M(x, nb)); + const auto tensorBlockQ4_0QS = mgr.tensorT(get_vec_block_Q4_0QS(x, nb, qk)); const auto tensorY = mgr.tensor(std::vector(y, y+y_size)); mgr.sequence() @@ -365,7 +365,7 @@ void ggml_vk_abmath(kp::Sequence& seq, uint32_t size, uint32_t row = 0) { GGML_ASSERT(with_row?row:!row); - const static auto spirv = compileSource(program_source_head+ + const static auto spirv = glsl_compile_source(program_source_head+ "#define 
MATH_OP "+std::string(1, mathOP)+"\n" "#define ROW_OP "+(with_row?"% pcs.row":"")+'\n'+ program_abmath, __func__); @@ -403,7 +403,7 @@ void ggml_vk_scale(kp::Sequence& seq, const std::shared_ptr& in, uint32_t inOff, const std::shared_ptr& out, uint32_t outOff, uint32_t size, float scale) { - const static auto spirv = compileSource(program_source_head+program_scale, __func__); + const static auto spirv = glsl_compile_source(program_source_head+program_scale, __func__); struct PushConstants { uint32_t inOff, outOff; @@ -450,7 +450,7 @@ void main() { template void ggml_vk_silu(Args&&... args) { - const static auto spirv = compileSource(program_source_head+program_silu, __func__); + const static auto spirv = glsl_compile_source(program_source_head+program_silu, __func__); ggml_vk_xxlu(spirv, std::forward(args)...); } @@ -476,7 +476,7 @@ void main() { template void ggml_vk_relu(Args&&... args) { - const static auto spirv = compileSource(program_source_head+program_relu, __func__); + const static auto spirv = glsl_compile_source(program_source_head+program_relu, __func__); ggml_vk_xxlu(spirv, std::forward(args)...); } @@ -503,7 +503,7 @@ void main() { template void ggml_vk_gelu(Args&&... 
args) { - const static auto spirv = compileSource(program_source_head+program_gelu, __func__); + const static auto spirv = glsl_compile_source(program_source_head+program_gelu, __func__); ggml_vk_xxlu(spirv, std::forward(args)...); } @@ -599,7 +599,7 @@ void ggml_vk_soft_max(kp::Sequence& seq, const std::shared_ptr& out, uint32_t outOff, int64_t ne00, int64_t ne01, int64_t ne02, uint64_t ne03) { const static unsigned nth = 32; - const static auto spirv = compileSource(program_source_head+"#define nth "+std::to_string(nth)+"\n"+program_soft_max, __func__); + const static auto spirv = glsl_compile_source(program_source_head+"#define nth "+std::to_string(nth)+"\n"+program_soft_max, __func__); struct PushConstants { int64_t ne00, ne01, ne02; From 964fe8c546dba2e88e13d6f6d09a62c45008ac61 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 30 Jun 2023 11:47:10 +0200 Subject: [PATCH 33/43] Added mul_mat (needs fixes) --- ggml-vulkan.cpp | 357 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 343 insertions(+), 14 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 70247a40d08ac..d6b99aa1fe365 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -217,6 +217,7 @@ static const std::string program_source_head = R"(#version 450 #extension GL_EXT_shader_explicit_arithmetic_types_float16: enable #extension GL_EXT_shader_explicit_arithmetic_types_int8: enable #extension GL_EXT_shader_explicit_arithmetic_types_int64: enable +#extension GL_EXT_control_flow_attributes: enable #define QK4_0 32 #define QR4_0 2 @@ -336,6 +337,44 @@ void ggml_vk_dequantize_row_q4_1(const void *x_, float *y, int k) { } +static const std::string program_fpx_to_fpx = + MULTILINE_QUOTE( +layout(push_constant) uniform PushConstants { + uint inOff; + uint outOff; + uint row; +} pcs; + +layout(local_size_x = 1) in; +layout(binding = 0) buffer restrict readonly tensorIn { IN_TYPE in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { OUT_TYPE out_[]; }; + +void main() { 
+ const uint i = gl_GlobalInvocationID.x; + + out_[pcs.outOff + i] = OUT_TYPE(in_[pcs.inOff + i]); +} +); + +void ggml_vk_fp32_to_fp16_row(kp::Sequence& seq, + const std::shared_ptr& in, uint32_t inOff, + const std::shared_ptr& out, uint32_t outOff, + uint32_t size) { + const static auto spirv = glsl_compile_source(program_source_head+ + "#define IN_TYPE float\n" + "#define OUT_TYPE float16_t\n"+ + program_fpx_to_fpx, __func__); + + struct PushConstants { + uint32_t inOff, outOff; + } const pushConsts { + inOff, outOff + }; + + seq.record(mgr.algorithm({in, out}, spirv, {size}, {}, {pushConsts})); +} + + static const std::string program_abmath = MULTILINE_QUOTE( layout(push_constant) uniform PushConstants { @@ -535,24 +574,24 @@ void main() { const uint out_off = pcs.outOff + extra_off; // parallel max - buf[gl_LocalInvocationID.x] = uintBitsToFloat(0xFF800000); - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { - buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], in_[in_off + i00]); + buf[gl_WorkGroupID.x] = uintBitsToFloat(0xFF800000); + for (uint i00 = gl_WorkGroupID.x; i00 < pcs.ne00; i00 += nth) { + buf[gl_WorkGroupID.x] = max(buf[gl_WorkGroupID.x], in_[in_off + i00]); } // reduce barrier(); memoryBarrierShared(); - for (uint i = nth/2; i > 0; i /= 2) { - if (gl_LocalInvocationID.x < i) { - buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], buf[gl_LocalInvocationID.x + i]); + [[unroll]] for (uint i = nth/2; i > 0; i /= 2) { + if (gl_WorkGroupID.x < i) { + buf[gl_WorkGroupID.x] = max(buf[gl_WorkGroupID.x], buf[gl_WorkGroupID.x + i]); } barrier(); memoryBarrierShared(); } // broadcast (no effect?) - if (gl_LocalInvocationID.x == 0) { + if (gl_WorkGroupID.x == 0) { buf[0] = buf[0]; // ??? 
} @@ -562,24 +601,24 @@ void main() { const float max_ = buf[0]; // parallel sum - buf[gl_LocalInvocationID.x] = 0.0; - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { - buf[gl_LocalInvocationID.x] += exp(in_[in_off + i00] - max_); + buf[gl_WorkGroupID.x] = 0.0; + for (uint i00 = gl_WorkGroupID.x; i00 < pcs.ne00; i00 += nth) { + buf[gl_WorkGroupID.x] += exp(in_[in_off + i00] - max_); } // reduce barrier(); memoryBarrierShared(); for (uint i = nth/2; i > 0; i /= 2) { - if (gl_LocalInvocationID.x < i) { - buf[gl_LocalInvocationID.x] += buf[gl_LocalInvocationID.x + i]; + if (gl_WorkGroupID.x < i) { + buf[gl_WorkGroupID.x] += buf[gl_WorkGroupID.x + i]; } barrier(); memoryBarrierShared(); } // broadcast (no effect?) - if (gl_LocalInvocationID.x == 0) { + if (gl_WorkGroupID.x == 0) { buf[0] = buf[0]; // ??? } @@ -588,7 +627,7 @@ void main() { const float sum = buf[0]; - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + for (uint i00 = gl_WorkGroupID.x; i00 < pcs.ne00; i00 += nth) { out_[out_off + i00] = exp(in_[in_off + i00] - max_) / sum; } } @@ -612,6 +651,285 @@ void ggml_vk_soft_max(kp::Sequence& seq, } +static const std::string program_mul_mat_f16 = R"( +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 +)" MULTILINE_QUOTE( +layout(local_size_x = (BM * BN) / (TM * TN), local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; +layout (binding = 1) readonly buffer tensorInB { float16_t inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + int M; + int N; + int K; + int inAStride; + int inBStride; + int outStride; + uint inAOff; + uint inBOff; + uint outOff; +} pcs; + +shared float16_t bufA[BM * (BK+1)]; +shared float16_t bufB[BN * (BK+1)]; + +void main() { + const int ir = int(gl_WorkGroupID.x); + const int ic = int(gl_WorkGroupID.y); + + const int rstride = BM / TM; + + const 
int lr = int(gl_LocalInvocationID.x % rstride); + const int lc = int(gl_LocalInvocationID.x / rstride); + + const int loadr = int(gl_LocalInvocationID.x % BK); + const int loadc = int(gl_LocalInvocationID.x / BK); + + const int loadstride = int(gl_WorkGroupSize.x); + + int posA = ir * BM * pcs.inAStride; + int posB = ic * BN * pcs.inBStride; + + float sums[TM * TN]; + float16_t cacheA[TM]; + float16_t cacheB[TN]; + + [[unroll]] for (int i = 0; i < TM*TN; i++) { + sums[i] = 0.0hf; + } + + [[unroll]] for (int block = 0; block < pcs.K; block += BK) { + [[unroll]] for (int l = 0; l < BM * BK; l += loadstride) { + const int lr = l % BK; + const int lc = l / BK; + bufA[(loadc + lc) * (BK+1) + loadr + lr] = inA[posA + (loadc + lc) * pcs.inAStride + loadr + lr]; + } + [[unroll]] for (int l = 0; l < BN * BK; l += loadstride) { + const int lr = l % BK; + const int lc = l / BK; + bufB[(loadc + lc) * (BK+1) + loadr + lr] = inB[posB + (loadc + lc) * pcs.inBStride + loadr + lr]; + } + + barrier(); + + posA += BK; + posB += BK; + + [[unroll]] for (int i = 0; i < BK; i++) { + // Load from shared into cache + [[unroll]] for (int j = 0; j < BM; j++) { + cacheA[j] = bufA[(lr + j*rstride) * (BK+1) + i]; + } + [[unroll]] for (int j = 0; j < TN; j++) { + cacheB[j] = bufB[(lc * TN + j) * (BK+1) + i]; + } + + [[unroll]] for (int cc = 0; cc < TN; cc++) { + [[unroll]] for (int cr = 0; cr < TM; cr++) { + sums[cc * TM + cr] += float(cacheA[cr]) * float(cacheB[cc]); + } + } + } + + barrier(); + } + + const int dr = ir * BM + lr; + const int dc = ic * BN + lc * TN; + + [[unroll]] for (int cc = 0; cc < TN; cc++) { + [[unroll]] for (int cr = 0; cr < TM; cr++) { + out_[(dc + cc) * pcs.outStride + dr + cr*rstride] = sums[cc * TM + cr]; + } + } +} +); + +void ggml_vk_mul_mat_f16(kp::Sequence& seq, + const std::shared_ptr& inA, uint32_t inAOff, + const std::shared_ptr& inB, uint32_t inBOff, + const std::shared_ptr& out, uint32_t outOff, + int64_t ne00, int64_t ne01, int64_t ne02, uint64_t ne03, + 
int64_t ne10, int64_t ne11, + int nb10, int nb11, int nb12, int nb13, + int nb2, int nb3) { + const static auto spirv = glsl_compile_source(program_source_head+program_mul_mat_f16, __func__); + + const bool inB_cont_rows = nb10 == sizeof(float); + const bool inB_cont_cols = (size_t)nb11 == ne11 * sizeof(float); + + struct PushConstants { + int32_t M, N, K, inAStride, inBStride, outStride; + uint32_t inAOff, inBOff, outOff; + } pushConsts { + (int)ne01, (int)ne11, (int)ne10, (int)ne00, (int)ne10, (int)ne01, + inAOff, inBOff, outOff + }; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + auto tmp = mgr.tensorT(std::vector(ne10*ne11)); + + if (inB_cont_rows) { + if (inB_cont_cols) { + ggml_vk_fp32_to_fp16_row(seq, inB, (i03*nb13 + i02*nb12)/sizeof(float), tmp, 0, ne10*ne11); + } + else { + for (int64_t i01 = 0; i01 < ne11; i01++) { + ggml_vk_fp32_to_fp16_row(seq, inB, (i03*nb13 + i02*nb12 + i01*nb11)/sizeof(float), tmp, i01*ne10, ne10); + } + } + } else { + for (int64_t i01 = 0; i01 < ne11; i01++) { + for (int64_t i00 = 0; i00 < ne10; i00++) { + // Extremely slow because of single shader invocation + ggml_vk_fp32_to_fp16_row(seq, inB, (i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10)/sizeof(float), tmp, i01*ne10 + i00, 1); + } + } + } + + seq.record(mgr.algorithm({inA, tmp, out}, spirv, {(uint32_t)ne01, (uint32_t)ne11}, {}, {pushConsts})); + } + } +} + + +static const std::string program_mul_mat_f32 = R"( +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 +)" MULTILINE_QUOTE( +layout(local_size_x = (BM * BN) / (TM * TN), local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer tensorInA { float inA[]; }; +layout (binding = 1) readonly buffer tensorInB { float inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + int M; + int N; + int K; + int inAStride; + int inBStride; + int outStride; + uint inAOff; + uint inBOff; 
+ uint outOff; +} pcs; + +shared float bufA[BM * (BK+1)]; +shared float bufB[BN * (BK+1)]; + +void main() { + const int ir = int(gl_WorkGroupID.x); + const int ic = int(gl_WorkGroupID.y); + + const int rstride = BM / TM; + + const int lr = int(gl_WorkGroupID.x % rstride); + const int lc = int(gl_WorkGroupID.x / rstride); + + const int loadr = int(gl_WorkGroupID.x % BK); + const int loadc = int(gl_WorkGroupID.x / BK); + + const int loadstride = int(gl_WorkGroupSize.x); + + int posA = ir * BM * pcs.inAStride; + int posB = ic * BN * pcs.inBStride; + + float sums[TM * TN]; + float cacheA[TM]; + float cacheB[TN]; + + [[unroll]] for (int i = 0; i < TM*TN; i++) { + sums[i] = 0.0f; + } + + [[unroll]] for (int block = 0; block < pcs.K; block += BK) { + [[unroll]] for (int l = 0; l < BM * BK; l += loadstride) { + const int lr = l % BK; + const int lc = l / BK; + bufA[(loadc + lc) * (BK+1) + loadr + lr] = inA[posA + (loadc + lc) * pcs.inAStride + loadr + lr + pcs.inAOff]; + } + [[unroll]] for (int l = 0; l < BN * BK; l += loadstride) { + const int lr = l % BK; + const int lc = l / BK; + bufB[(loadc + lc) * (BK+1) + loadr + lr] = inB[posB + (loadc + lc) * pcs.inBStride + loadr + lr + pcs.inBOff]; + } + + barrier(); + memoryBarrierShared(); + + posA += BK; + posB += BK; + + [[unroll]] for (int i = 0; i < BK; i++) { + // Load from shared into cache + [[unroll]] for (int j = 0; j < BM; j++) { + cacheA[j] = bufA[(lr + j*rstride) * (BK+1) + i]; + } + [[unroll]] for (int j = 0; j < TN; j++) { + cacheB[j] = bufB[(lc * TN + j) * (BK+1) + i]; + } + + [[unroll]] for (int cc = 0; cc < TN; cc++) { + [[unroll]] for (int cr = 0; cr < TM; cr++) { + sums[cc * TM + cr] += cacheA[cr] * cacheB[cc]; + } + } + } + + barrier(); + } + + const int dr = ir * BM + lr; + const int dc = ic * BN + lc * TN; + + [[unroll]] for (int cc = 0; cc < TN; cc++) { + [[unroll]] for (int cr = 0; cr < TM; cr++) { + out_[(dc + cc) * pcs.outStride + dr + cr*rstride + pcs.outOff] = sums[cc * TM + cr]; + } + } +} +); + 
+void ggml_vk_mul_mat_f32(kp::Sequence& seq, + const std::shared_ptr& inA, uint32_t inAOff, + const std::shared_ptr& inB, uint32_t inBOff, + const std::shared_ptr& out, uint32_t outOff, + int64_t ne00, int64_t ne01, int64_t ne02, uint64_t ne03, + int64_t ne10, int64_t ne11, + int nb2, int nb3) { + const static auto spirv = glsl_compile_source(program_source_head+program_mul_mat_f32, __func__); + + struct PushConstants { + int32_t M, N, K, inAStride, inBStride, outStride; + uint32_t inAOff, inBOff, outOff; + } pushConsts { + (int)ne01, (int)ne11, (int)ne10, (int)ne00, (int)ne10, (int)ne01, + inAOff, inBOff, outOff + }; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + auto off = i02*nb2 + i03*nb3; + pushConsts.inAOff = inAOff + off; + pushConsts.inBOff = inBOff + off; + pushConsts.outOff = outOff + off; + seq.record(mgr.algorithm({inA, inB, out}, spirv, {(uint32_t)ne01, (uint32_t)ne11}, {}, {pushConsts})); + } + } +} + + void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) { printf("%s: evaluating graph\n", __func__); @@ -723,6 +1041,17 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { ggml_vk_soft_max(seq, id_src0, offs_src0, id_dst, offs_dst, ne00, ne01, ne02, ne03); } break; + case GGML_OP_MUL_MAT: + { + if (src0->type == GGML_TYPE_F32 + && src1->type == GGML_TYPE_F32) { + ggml_vk_mul_mat_f32(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, ne02, ne03, ne10, ne11, nb2, nb3); + break; + } else if (src0->type == GGML_TYPE_F32 + && src1->type == GGML_TYPE_F16) { + ggml_vk_mul_mat_f16(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, ne02, ne03, ne10, ne11, nb10, nb11, nb12, nb13, nb2, nb3); + } + } default: fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); //GGML_ASSERT(false); From f093bf2e5e8fc650d1274d3920cb20a8d82c1c35 Mon Sep 17 00:00:00 2001 From: 
niansa Date: Fri, 30 Jun 2023 12:19:29 +0200 Subject: [PATCH 34/43] Minor MUL_MAT fix and implemented DIAG_MASK_INF --- ggml-vulkan.cpp | 84 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 66 insertions(+), 18 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index d6b99aa1fe365..7b92a7bac071a 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -223,8 +223,8 @@ static const std::string program_source_head = R"(#version 450 #define QR4_0 2 #define QK4_1 32 -#define GELU_COEF_A 0.044715; -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876; +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 #ifndef QK_K #define QK_K 256 @@ -235,6 +235,12 @@ static const std::string program_source_head = R"(#version 450 #else #define K_SCALE_SIZE 4 #endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 )"; @@ -651,13 +657,56 @@ void ggml_vk_soft_max(kp::Sequence& seq, } -static const std::string program_mul_mat_f16 = R"( -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 -)" MULTILINE_QUOTE( +static const std::string program_diag_mask_inf = + MULTILINE_QUOTE( +layout(push_constant) uniform PushConstants { + uint64_t ne00; + uint64_t ne01; + uint inAOff; + uint inBOff; + uint outOff; +} pcs; + +layout(local_size_x = 1) in; +layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; +layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; }; +layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; + +void main() { + const uint64_t i02 = uint64_t(gl_GlobalInvocationID.z); + const uint64_t i01 = uint64_t(gl_GlobalInvocationID.y); + const uint64_t i00 = uint64_t(gl_GlobalInvocationID.x); + + const int n_past = inB[pcs.inBOff]; + + if (i00 > n_past + i01) { + out_[uint(i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00 + pcs.outOff)] = uintBitsToFloat(0xFF800000); + } else { + out_[uint(i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00 + 
pcs.outOff)] = inA[uint(i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00 + pcs.inAOff)]; + } +} +); + +void ggml_vk_diag_mask_inf(kp::Sequence& seq, + const std::shared_ptr& inA, uint32_t inAOff, + const std::shared_ptr& inB, uint32_t inBOff, + const std::shared_ptr& out, uint32_t outOff, + int64_t ne00, int64_t ne01, int64_t ne02) { + const static auto spirv = glsl_compile_source(program_source_head+program_diag_mask_inf, __func__); + + struct PushConstants { + int64_t ne00, ne01; + uint32_t inAOff, inBOff, outOff; + } pushConsts { + ne00, ne01, inAOff, inBOff, outOff + }; + + seq.record(mgr.algorithm({inA, inB, out}, spirv, {unsigned(ne00), unsigned(ne01), unsigned(ne02)}, {}, {pushConsts})); +} + + +static const std::string program_mul_mat_f16 = + MULTILINE_QUOTE( layout(local_size_x = (BM * BN) / (TM * TN), local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; @@ -800,13 +849,8 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq, } -static const std::string program_mul_mat_f32 = R"( -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 -)" MULTILINE_QUOTE( +static const std::string program_mul_mat_f32 = + MULTILINE_QUOTE( layout(local_size_x = (BM * BN) / (TM * TN), local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer tensorInA { float inA[]; }; @@ -1041,14 +1085,18 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { ggml_vk_soft_max(seq, id_src0, offs_src0, id_dst, offs_dst, ne00, ne01, ne02, ne03); } break; + case GGML_OP_DIAG_MASK_INF: + { + ggml_vk_diag_mask_inf(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, ne02); + } break; case GGML_OP_MUL_MAT: { if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { ggml_vk_mul_mat_f32(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, ne02, ne03, ne10, ne11, nb2, nb3); break; - } else if (src0->type == GGML_TYPE_F32 - && src1->type == 
GGML_TYPE_F16) { + } else if (src0->type == GGML_TYPE_F16 + && src1->type == GGML_TYPE_F32) { ggml_vk_mul_mat_f16(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, ne02, ne03, ne10, ne11, nb10, nb11, nb12, nb13, nb2, nb3); } } From 0dc5f2f2bad7c34b4caff0bd27b274e474335918 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 30 Jun 2023 12:31:13 +0200 Subject: [PATCH 35/43] Fixed mul mat dispatch size --- ggml-vulkan.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 7b92a7bac071a..18c7ba8fa0631 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -843,7 +843,7 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq, } } - seq.record(mgr.algorithm({inA, tmp, out}, spirv, {(uint32_t)ne01, (uint32_t)ne11}, {}, {pushConsts})); + seq.record(mgr.algorithm({inA, tmp, out}, spirv, {uint32_t(ne01/128), uint32_t(ne11/128)}, {}, {pushConsts})); } } } @@ -968,7 +968,7 @@ void ggml_vk_mul_mat_f32(kp::Sequence& seq, pushConsts.inAOff = inAOff + off; pushConsts.inBOff = inBOff + off; pushConsts.outOff = outOff + off; - seq.record(mgr.algorithm({inA, inB, out}, spirv, {(uint32_t)ne01, (uint32_t)ne11}, {}, {pushConsts})); + seq.record(mgr.algorithm({inA, inB, out}, spirv, {uint32_t(ne01/128), uint32_t(ne11/128)}, {}, {pushConsts})); } } } @@ -1037,7 +1037,6 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; - const static std::shared_ptr nullTensor = nullptr; const std::shared_ptr& id_src0 = src0 ? ggml_vk_get_tensor(ctx, src0) : nullTensor; const std::shared_ptr& id_src1 = src1 ? 
ggml_vk_get_tensor(ctx, src1) : nullTensor; From 8fa60134b17c0de70e6f5a55c2ec6f241bd2915b Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 30 Jun 2023 12:47:17 +0200 Subject: [PATCH 36/43] Added missing break to mul_mat_f16 case --- ggml-vulkan.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 18c7ba8fa0631..4d9c458dfc6cd 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1097,6 +1097,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { ggml_vk_mul_mat_f16(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, ne02, ne03, ne10, ne11, nb10, nb11, nb12, nb13, nb2, nb3); + break; } } default: From d1f84db4b6c970001c7a171155c7a4204a18aa35 Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 30 Jun 2023 15:18:10 +0200 Subject: [PATCH 37/43] Implemented GGML_OP_NORM --- ggml-vulkan.cpp | 109 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 105 insertions(+), 4 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 4d9c458dfc6cd..29c67e7768226 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -482,7 +482,7 @@ layout(push_constant) uniform PushConstants { } pcs; layout(local_size_x = 1) in; -layout(binding = 0) buffer restrict readonly tensorInA { float in_[]; }; +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; void main() { @@ -509,7 +509,7 @@ layout(push_constant) uniform PushConstants { } pcs; layout(local_size_x = 1) in; -layout(binding = 0) buffer restrict readonly tensorInA { float in_[]; }; +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; void main() { @@ -535,7 +535,7 @@ layout(push_constant) uniform PushConstants { } pcs; layout(local_size_x = 1) in; -layout(binding = 0) 
buffer restrict readonly tensorInA { float in_[]; }; +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; void main() { @@ -565,7 +565,7 @@ layout(push_constant) uniform PushConstants { } pcs; layout(local_size_x = nth) in; -layout(binding = 0) buffer restrict readonly tensorInA { float in_[]; }; +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; shared float buf[nth]; @@ -657,6 +657,104 @@ void ggml_vk_soft_max(kp::Sequence& seq, } +static const std::string program_norm = + MULTILINE_QUOTE( +layout(push_constant) uniform PushConstants { + uint64_t ne00; + uint64_t nb01; + float eps; + uint inOff; + uint outOff; +} pcs; + +layout(local_size_x = 1) in; +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; +layout(binding = 1) buffer restrict tensorOut { float out_[]; }; + +shared float sum[nth]; + +void main() { + const uint x = gl_GlobalInvocationID.x; // Based from in_ + // MEAN + // parallel sum + sum[gl_GlobalInvocationID.y] = 0.0; + for (uint i00 = gl_GlobalInvocationID.y; i00 < pcs.ne00; i00 += nth) { + sum[gl_GlobalInvocationID.y] += in_[x+i00]; + } + // reduce + barrier(); + memoryBarrierShared(); + for (uint i = nth/2; i > 0; i /= 2) { + if (gl_GlobalInvocationID.y < i) { + sum[gl_GlobalInvocationID.y] += sum[gl_GlobalInvocationID.y + i]; + } + barrier(); + memoryBarrierShared(); + } + // broadcast + if (gl_GlobalInvocationID.y == 0) { + sum[0] /= float(pcs.ne00); + } + barrier(); + memoryBarrierShared(); + const float mean = sum[0]; + + // recenter + const uint y = gl_GlobalInvocationID.x; // Based from out_ + for (uint i00 = gl_GlobalInvocationID.y; i00 < pcs.ne00; i00 += nth) { + out_[y+i00] = in_[x+i00] - mean; + } + + // VARIANCE + // parallel sum + sum[gl_GlobalInvocationID.y] = 0.0; + for (uint i00 = gl_GlobalInvocationID.y; i00 < pcs.ne00; 
i00 += nth) { + sum[gl_GlobalInvocationID.y] += out_[y+i00] * out_[y+i00]; + } + // reduce + barrier(); + memoryBarrierShared(); + for (uint i = nth/2; i > 0; i /= 2) { + if (gl_GlobalInvocationID.y < i) { + sum[gl_GlobalInvocationID.y] += sum[gl_GlobalInvocationID.y + i]; + } + barrier(); + memoryBarrierShared(); + } + // broadcast + if (gl_GlobalInvocationID.y == 0) { + sum[0] /= float(pcs.ne00); + } + barrier(); + memoryBarrierShared(); + const float variance = sum[0]; + + const float scale = 1.0/sqrt(variance + pcs.eps); + for (uint i00 = gl_GlobalInvocationID.y; i00 < pcs.ne00; i00 += nth) { + out_[y+i00] *= scale; + } +} +); + +void ggml_vk_norm(kp::Sequence& seq, + const std::shared_ptr& in, uint32_t inOff, + const std::shared_ptr& out, uint32_t outOff, + int64_t ne00, int64_t ne01, + int64_t nrows) { + const static unsigned nth = 256; + const static auto spirv = glsl_compile_source(program_source_head+"#define nth "+std::to_string(nth)+"\n"+program_norm, __func__); + + struct PushConstants { + uint64_t ne00, nb01; + float eps; + uint32_t inOff, outOff; + } pushConsts { + (uint64_t)ne00, (uint64_t)ne01, 1e-5f, inOff, outOff + }; + + seq.record(mgr.algorithm({in, out}, spirv, {(uint32_t)nrows, nth}, {}, {pushConsts})); +} + static const std::string program_diag_mask_inf = MULTILINE_QUOTE( layout(push_constant) uniform PushConstants { @@ -1100,6 +1198,9 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph break; } } + case GGML_OP_NORM: { + ggml_vk_norm(seq, id_src0, offs_src0, id_dst, offs_dst, ne00, ne01, ggml_nrows(src0)); + } break; default: fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); //GGML_ASSERT(false); From f0e1429d7fd56483ee8352c6bca40344a653f01a Mon Sep 17 00:00:00 2001 From: niansa Date: Fri, 30 Jun 2023 16:01:08 +0200 Subject: [PATCH 38/43] Implemented RMS_NORM --- ggml-vulkan.cpp | 97 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 89 
insertions(+), 8 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 29c67e7768226..35d31157b3f52 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -657,6 +657,23 @@ void ggml_vk_soft_max(kp::Sequence& seq, } +void ggml_vk_norm(kp::Sequence& seq, std::vector spirv, unsigned nth, + const std::shared_ptr& in, uint32_t inOff, + const std::shared_ptr& out, uint32_t outOff, + int64_t ne00, int64_t ne01, + int64_t nrows) { + struct PushConstants { + uint64_t ne00, nb01; + float eps; + uint32_t inOff, outOff; + } pushConsts { + (uint64_t)ne00, (uint64_t)ne01, 1e-5f, inOff, outOff + }; + + seq.record(mgr.algorithm({in, out}, spirv, {(uint32_t)nrows, nth}, {}, {pushConsts})); +} + + static const std::string program_norm = MULTILINE_QUOTE( layout(push_constant) uniform PushConstants { @@ -681,6 +698,7 @@ void main() { for (uint i00 = gl_GlobalInvocationID.y; i00 < pcs.ne00; i00 += nth) { sum[gl_GlobalInvocationID.y] += in_[x+i00]; } + // reduce barrier(); memoryBarrierShared(); @@ -691,6 +709,7 @@ void main() { barrier(); memoryBarrierShared(); } + // broadcast if (gl_GlobalInvocationID.y == 0) { sum[0] /= float(pcs.ne00); @@ -711,6 +730,7 @@ void main() { for (uint i00 = gl_GlobalInvocationID.y; i00 < pcs.ne00; i00 += nth) { sum[gl_GlobalInvocationID.y] += out_[y+i00] * out_[y+i00]; } + // reduce barrier(); memoryBarrierShared(); @@ -721,6 +741,7 @@ void main() { barrier(); memoryBarrierShared(); } + // broadcast if (gl_GlobalInvocationID.y == 0) { sum[0] /= float(pcs.ne00); @@ -744,16 +765,73 @@ void ggml_vk_norm(kp::Sequence& seq, const static unsigned nth = 256; const static auto spirv = glsl_compile_source(program_source_head+"#define nth "+std::to_string(nth)+"\n"+program_norm, __func__); - struct PushConstants { - uint64_t ne00, nb01; - float eps; - uint32_t inOff, outOff; - } pushConsts { - (uint64_t)ne00, (uint64_t)ne01, 1e-5f, inOff, outOff - }; + ggml_vk_norm(seq, spirv, nth, in, inOff, out, outOff, ne00, ne01, nrows); +} - 
seq.record(mgr.algorithm({in, out}, spirv, {(uint32_t)nrows, nth}, {}, {pushConsts})); + +static const std::string program_rms_norm = + MULTILINE_QUOTE( +layout(push_constant) uniform PushConstants { + uint64_t ne00; + uint64_t nb01; + float eps; + uint inOff; + uint outOff; +} pcs; + +layout(local_size_x = 1) in; +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; +layout(binding = 1) buffer restrict tensorOut { float out_[]; }; + +shared float sum[nth]; + +void main() { + const uint x = gl_GlobalInvocationID.x; // Based from in_ + + // parallel sum + sum[gl_GlobalInvocationID.y] = 0.0; + for (uint i00 = gl_GlobalInvocationID.y; i00 < pcs.ne00; i00 += nth) { + sum[gl_GlobalInvocationID.y] += in_[x+i00] * in_[x+i00]; + } + + // reduce + barrier(); + memoryBarrierShared(); + for (uint i = nth/2; i > 0; i /= 2) { + if (gl_GlobalInvocationID.y < i) { + sum[gl_GlobalInvocationID.y] += sum[gl_GlobalInvocationID.y + i]; + } + barrier(); + memoryBarrierShared(); + } + + // broadcast + if (gl_GlobalInvocationID.y == 0) { + sum[0] /= float(pcs.ne00); + } + barrier(); + memoryBarrierShared(); + + const float scale = 1.0f/sqrt(sum[0] + pcs.eps); + + const uint y = gl_GlobalInvocationID.x; // Based from out_ + for (uint i00 = gl_GlobalInvocationID.y; i00 < pcs.ne00; i00 += nth) { + out_[y+i00] = in_[x+i00] * scale; + } } +); + +void ggml_vk_rms_norm(kp::Sequence& seq, + const std::shared_ptr& in, uint32_t inOff, + const std::shared_ptr& out, uint32_t outOff, + int64_t ne00, int64_t ne01, + int64_t nrows) { + const static unsigned nth = 256; + const static auto spirv = glsl_compile_source(program_source_head+"#define nth "+std::to_string(nth)+"\n"+program_rms_norm, __func__); + + ggml_vk_norm(seq, spirv, nth, in, inOff, out, outOff, ne00, ne01, nrows); +} + static const std::string program_diag_mask_inf = MULTILINE_QUOTE( @@ -1201,6 +1279,9 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph case GGML_OP_NORM: { 
ggml_vk_norm(seq, id_src0, offs_src0, id_dst, offs_dst, ne00, ne01, ggml_nrows(src0)); } break; + case GGML_OP_RMS_NORM: { + ggml_vk_rms_norm(seq, id_src0, offs_src0, id_dst, offs_dst, ne00, ne01, ggml_nrows(src0)); + } break; default: fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); //GGML_ASSERT(false); From 2fc8249ba371bea3fd710819c30ce08930dbef73 Mon Sep 17 00:00:00 2001 From: niansa Date: Wed, 5 Jul 2023 10:59:38 +0200 Subject: [PATCH 39/43] Simple mul_mat_f16 for speed and removal of unused mul_mat_f32 --- ggml-vulkan.cpp | 169 +++++++++++------------------------------------- 1 file changed, 38 insertions(+), 131 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 35d31157b3f52..517b98135f588 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -976,6 +976,42 @@ void main() { } ); +static const std::string program_fast_mul_mat_f16 = + MULTILINE_QUOTE( +layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; + +layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; +layout (binding = 1) readonly buffer tensorInB { float16_t inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + int M; + int N; + int K; + int inAStride; + int inBStride; + int outStride; + uint inAOff; + uint inBOff; + uint outOff; +} pcs; + +void main() { + int row = int(gl_GlobalInvocationID.x); + int col = int(gl_GlobalInvocationID.y); + + if (row < pcs.M && col < pcs.N) { + float sum = 0.0f; + + for (int i = 0; i < pcs.K; i++) { + sum += float(inA[row * pcs.inAStride + i]) * float(inB[col * pcs.inBStride + i]); + } + + out_[col * pcs.outStride + row] = sum; + } +} +); + void ggml_vk_mul_mat_f16(kp::Sequence& seq, const std::shared_ptr& inA, uint32_t inAOff, const std::shared_ptr& inB, uint32_t inBOff, @@ -984,7 +1020,7 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq, int64_t ne10, int64_t ne11, int nb10, int nb11, int nb12, int nb13, int 
nb2, int nb3) { - const static auto spirv = glsl_compile_source(program_source_head+program_mul_mat_f16, __func__); + const static auto spirv = glsl_compile_source(program_source_head+program_fast_mul_mat_f16, __func__); const bool inB_cont_rows = nb10 == sizeof(float); const bool inB_cont_cols = (size_t)nb11 == ne11 * sizeof(float); @@ -1025,131 +1061,6 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq, } -static const std::string program_mul_mat_f32 = - MULTILINE_QUOTE( -layout(local_size_x = (BM * BN) / (TM * TN), local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer tensorInA { float inA[]; }; -layout (binding = 1) readonly buffer tensorInB { float inB[]; }; -layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout (push_constant) uniform parameter { - int M; - int N; - int K; - int inAStride; - int inBStride; - int outStride; - uint inAOff; - uint inBOff; - uint outOff; -} pcs; - -shared float bufA[BM * (BK+1)]; -shared float bufB[BN * (BK+1)]; - -void main() { - const int ir = int(gl_WorkGroupID.x); - const int ic = int(gl_WorkGroupID.y); - - const int rstride = BM / TM; - - const int lr = int(gl_WorkGroupID.x % rstride); - const int lc = int(gl_WorkGroupID.x / rstride); - - const int loadr = int(gl_WorkGroupID.x % BK); - const int loadc = int(gl_WorkGroupID.x / BK); - - const int loadstride = int(gl_WorkGroupSize.x); - - int posA = ir * BM * pcs.inAStride; - int posB = ic * BN * pcs.inBStride; - - float sums[TM * TN]; - float cacheA[TM]; - float cacheB[TN]; - - [[unroll]] for (int i = 0; i < TM*TN; i++) { - sums[i] = 0.0f; - } - - [[unroll]] for (int block = 0; block < pcs.K; block += BK) { - [[unroll]] for (int l = 0; l < BM * BK; l += loadstride) { - const int lr = l % BK; - const int lc = l / BK; - bufA[(loadc + lc) * (BK+1) + loadr + lr] = inA[posA + (loadc + lc) * pcs.inAStride + loadr + lr + pcs.inAOff]; - } - [[unroll]] for (int l = 0; l < BN * BK; l += loadstride) { - const int lr = l % BK; - const int lc = l 
/ BK; - bufB[(loadc + lc) * (BK+1) + loadr + lr] = inB[posB + (loadc + lc) * pcs.inBStride + loadr + lr + pcs.inBOff]; - } - - barrier(); - memoryBarrierShared(); - - posA += BK; - posB += BK; - - [[unroll]] for (int i = 0; i < BK; i++) { - // Load from shared into cache - [[unroll]] for (int j = 0; j < BM; j++) { - cacheA[j] = bufA[(lr + j*rstride) * (BK+1) + i]; - } - [[unroll]] for (int j = 0; j < TN; j++) { - cacheB[j] = bufB[(lc * TN + j) * (BK+1) + i]; - } - - [[unroll]] for (int cc = 0; cc < TN; cc++) { - [[unroll]] for (int cr = 0; cr < TM; cr++) { - sums[cc * TM + cr] += cacheA[cr] * cacheB[cc]; - } - } - } - - barrier(); - } - - const int dr = ir * BM + lr; - const int dc = ic * BN + lc * TN; - - [[unroll]] for (int cc = 0; cc < TN; cc++) { - [[unroll]] for (int cr = 0; cr < TM; cr++) { - out_[(dc + cc) * pcs.outStride + dr + cr*rstride + pcs.outOff] = sums[cc * TM + cr]; - } - } -} -); - -void ggml_vk_mul_mat_f32(kp::Sequence& seq, - const std::shared_ptr& inA, uint32_t inAOff, - const std::shared_ptr& inB, uint32_t inBOff, - const std::shared_ptr& out, uint32_t outOff, - int64_t ne00, int64_t ne01, int64_t ne02, uint64_t ne03, - int64_t ne10, int64_t ne11, - int nb2, int nb3) { - const static auto spirv = glsl_compile_source(program_source_head+program_mul_mat_f32, __func__); - - struct PushConstants { - int32_t M, N, K, inAStride, inBStride, outStride; - uint32_t inAOff, inBOff, outOff; - } pushConsts { - (int)ne01, (int)ne11, (int)ne10, (int)ne00, (int)ne10, (int)ne01, - inAOff, inBOff, outOff - }; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - auto off = i02*nb2 + i03*nb3; - pushConsts.inAOff = inAOff + off; - pushConsts.inBOff = inBOff + off; - pushConsts.outOff = outOff + off; - seq.record(mgr.algorithm({inA, inB, out}, spirv, {uint32_t(ne01/128), uint32_t(ne11/128)}, {}, {pushConsts})); - } - } -} - - void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) { printf("%s: 
evaluating graph\n", __func__); @@ -1266,11 +1177,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph } break; case GGML_OP_MUL_MAT: { - if (src0->type == GGML_TYPE_F32 - && src1->type == GGML_TYPE_F32) { - ggml_vk_mul_mat_f32(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, ne02, ne03, ne10, ne11, nb2, nb3); - break; - } else if (src0->type == GGML_TYPE_F16 + if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { ggml_vk_mul_mat_f16(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, ne02, ne03, ne10, ne11, nb10, nb11, nb12, nb13, nb2, nb3); break; From 6be93e607149e94550c1ba2fa273cdbaa64f2815 Mon Sep 17 00:00:00 2001 From: niansa Date: Wed, 5 Jul 2023 13:28:40 +0200 Subject: [PATCH 40/43] Ported mat mul from Metal --- ggml-vulkan.cpp | 188 ++++++++++++------------------------------------ 1 file changed, 47 insertions(+), 141 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 517b98135f588..5f1b8d43a753e 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -173,7 +173,7 @@ static std::vector glsl_compile_source(const std::string& source, cons std::ofstream fileOut("tmp_kp_shader.comp"); fileOut << source; fileOut.close(); - if (system(std::string("glslangValidator -V tmp_kp_shader.comp -o tmp_kp_shader.comp.spv > /dev/null").c_str())) + if (system("glslangValidator -V tmp_kp_shader.comp -o tmp_kp_shader.comp.spv > /dev/null")) throw std::runtime_error("Error running glslangValidator command"); std::ifstream fileStream("tmp_kp_shader.comp.spv", std::ios::binary); std::vector buffer; @@ -883,131 +883,59 @@ void ggml_vk_diag_mask_inf(kp::Sequence& seq, static const std::string program_mul_mat_f16 = MULTILINE_QUOTE( -layout(local_size_x = (BM * BN) / (TM * TN), local_size_y = 1, local_size_z = 1) in; +layout(local_size_x = 64) in; layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; -layout (binding = 1) readonly buffer tensorInB { float16_t 
inB[]; }; +layout (binding = 1) readonly buffer tensorInB { float inB[]; }; layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; layout (push_constant) uniform parameter { - int M; - int N; - int K; - int inAStride; - int inBStride; - int outStride; + int64_t ne00; + int64_t ne01; + uint64_t nb00; + uint64_t nb01; + uint64_t nb02; + int64_t ne10; + int64_t ne11; + uint64_t nb10; + uint64_t nb11; + uint64_t nb12; + int64_t ne0; + int64_t ne1; uint inAOff; uint inBOff; uint outOff; } pcs; -shared float16_t bufA[BM * (BK+1)]; -shared float16_t bufB[BN * (BK+1)]; +shared float sum[gl_WorkGroupSize.x]; void main() { - const int ir = int(gl_WorkGroupID.x); - const int ic = int(gl_WorkGroupID.y); - - const int rstride = BM / TM; - - const int lr = int(gl_LocalInvocationID.x % rstride); - const int lc = int(gl_LocalInvocationID.x / rstride); - - const int loadr = int(gl_LocalInvocationID.x % BK); - const int loadc = int(gl_LocalInvocationID.x / BK); + const int64_t r0 = gl_GlobalInvocationID.x; + const int64_t r1 = gl_GlobalInvocationID.y; + const int64_t im = gl_GlobalInvocationID.z; - const int loadstride = int(gl_WorkGroupSize.x); + const uint x = uint((r0*pcs.nb01 + im*pcs.nb02) / 2); // Based from inA + const uint y = uint((r1*pcs.nb11 + im*pcs.nb12) / 4); // based from inB - int posA = ir * BM * pcs.inAStride; - int posB = ic * BN * pcs.inBStride; + sum[gl_LocalInvocationID.x] = 0.0f; - float sums[TM * TN]; - float16_t cacheA[TM]; - float16_t cacheB[TN]; - - [[unroll]] for (int i = 0; i < TM*TN; i++) { - sums[i] = 0.0hf; + for (uint i = gl_LocalInvocationID.x; i < pcs.ne00; i += gl_WorkGroupSize.x) { + sum[gl_LocalInvocationID.x] += float(inA[x+i]) * float(inB[y+i]); } - [[unroll]] for (int block = 0; block < pcs.K; block += BK) { - [[unroll]] for (int l = 0; l < BM * BK; l += loadstride) { - const int lr = l % BK; - const int lc = l / BK; - bufA[(loadc + lc) * (BK+1) + loadr + lr] = inA[posA + (loadc + lc) * pcs.inAStride + loadr + lr]; - } - [[unroll]] for (int l = 0; 
l < BN * BK; l += loadstride) { - const int lr = l % BK; - const int lc = l / BK; - bufB[(loadc + lc) * (BK+1) + loadr + lr] = inB[posB + (loadc + lc) * pcs.inBStride + loadr + lr]; - } - - barrier(); - - posA += BK; - posB += BK; - - [[unroll]] for (int i = 0; i < BK; i++) { - // Load from shared into cache - [[unroll]] for (int j = 0; j < BM; j++) { - cacheA[j] = bufA[(lr + j*rstride) * (BK+1) + i]; - } - [[unroll]] for (int j = 0; j < TN; j++) { - cacheB[j] = bufB[(lc * TN + j) * (BK+1) + i]; - } - - [[unroll]] for (int cc = 0; cc < TN; cc++) { - [[unroll]] for (int cr = 0; cr < TM; cr++) { - sums[cc * TM + cr] += float(cacheA[cr]) * float(cacheB[cc]); - } - } + // accumulate the sum from all threads in the threadgroup + barrier(); + memoryBarrierShared(); + for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) { + if (gl_LocalInvocationID.x < i) { + sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i]; } - barrier(); + memoryBarrierShared(); } - const int dr = ir * BM + lr; - const int dc = ic * BN + lc * TN; - - [[unroll]] for (int cc = 0; cc < TN; cc++) { - [[unroll]] for (int cr = 0; cr < TM; cr++) { - out_[(dc + cc) * pcs.outStride + dr + cr*rstride] = sums[cc * TM + cr]; - } - } -} -); - -static const std::string program_fast_mul_mat_f16 = - MULTILINE_QUOTE( -layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; - -layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; -layout (binding = 1) readonly buffer tensorInB { float16_t inB[]; }; -layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout (push_constant) uniform parameter { - int M; - int N; - int K; - int inAStride; - int inBStride; - int outStride; - uint inAOff; - uint inBOff; - uint outOff; -} pcs; - -void main() { - int row = int(gl_GlobalInvocationID.x); - int col = int(gl_GlobalInvocationID.y); - - if (row < pcs.M && col < pcs.N) { - float sum = 0.0f; - - for (int i = 0; i < pcs.K; i++) { - sum += float(inA[row * pcs.inAStride + i]) * float(inB[col * 
pcs.inBStride + i]); - } - - out_[col * pcs.outStride + row] = sum; + if (gl_LocalInvocationID.x == 0) { + out_[uint(im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0)] = sum[0]; } } ); @@ -1016,48 +944,26 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq, const std::shared_ptr& inA, uint32_t inAOff, const std::shared_ptr& inB, uint32_t inBOff, const std::shared_ptr& out, uint32_t outOff, - int64_t ne00, int64_t ne01, int64_t ne02, uint64_t ne03, - int64_t ne10, int64_t ne11, - int nb10, int nb11, int nb12, int nb13, - int nb2, int nb3) { - const static auto spirv = glsl_compile_source(program_source_head+program_fast_mul_mat_f16, __func__); - - const bool inB_cont_rows = nb10 == sizeof(float); - const bool inB_cont_cols = (size_t)nb11 == ne11 * sizeof(float); + int64_t ne00, int64_t ne01, + uint64_t nb00, uint64_t nb01, uint64_t nb02, + int64_t ne10, int64_t ne11, int64_t ne12, + uint64_t nb10, uint64_t nb11, uint64_t nb12, + int64_t ne0, int64_t ne1) { + const static auto spirv = glsl_compile_source(program_source_head+program_mul_mat_f16, __func__); struct PushConstants { - int32_t M, N, K, inAStride, inBStride, outStride; + int64_t ne00, ne01; + uint64_t nb00, nb01, nb02; + int64_t ne10, ne11; + uint64_t nb10, nb11, nb12; + int64_t ne0, ne1; uint32_t inAOff, inBOff, outOff; } pushConsts { - (int)ne01, (int)ne11, (int)ne10, (int)ne00, (int)ne10, (int)ne01, + ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, ne0, ne1, inAOff, inBOff, outOff }; - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - auto tmp = mgr.tensorT(std::vector(ne10*ne11)); - - if (inB_cont_rows) { - if (inB_cont_cols) { - ggml_vk_fp32_to_fp16_row(seq, inB, (i03*nb13 + i02*nb12)/sizeof(float), tmp, 0, ne10*ne11); - } - else { - for (int64_t i01 = 0; i01 < ne11; i01++) { - ggml_vk_fp32_to_fp16_row(seq, inB, (i03*nb13 + i02*nb12 + i01*nb11)/sizeof(float), tmp, i01*ne10, ne10); - } - } - } else { - for (int64_t i01 = 0; i01 < ne11; i01++) { - for (int64_t i00 = 0; i00 < 
ne10; i00++) { - // Extremely slow because of single shader invocation - ggml_vk_fp32_to_fp16_row(seq, inB, (i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10)/sizeof(float), tmp, i01*ne10 + i00, 1); - } - } - } - - seq.record(mgr.algorithm({inA, tmp, out}, spirv, {uint32_t(ne01/128), uint32_t(ne11/128)}, {}, {pushConsts})); - } - } + seq.record(mgr.algorithm({inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {}, {pushConsts})); } @@ -1179,7 +1085,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { - ggml_vk_mul_mat_f16(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, ne02, ne03, ne10, ne11, nb10, nb11, nb12, nb13, nb2, nb3); + ggml_vk_mul_mat_f16(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1); break; } } From 856b7589e9661507ff256b401f93c95da3173f2e Mon Sep 17 00:00:00 2001 From: niansa Date: Wed, 5 Jul 2023 13:34:01 +0200 Subject: [PATCH 41/43] Optimized ggml_vk_mul_mat_f16 argument count --- ggml-vulkan.cpp | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 5f1b8d43a753e..6aab3ddaed0d4 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -891,13 +891,8 @@ layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; layout (push_constant) uniform parameter { int64_t ne00; - int64_t ne01; - uint64_t nb00; uint64_t nb01; uint64_t nb02; - int64_t ne10; - int64_t ne11; - uint64_t nb10; uint64_t nb11; uint64_t nb12; int64_t ne0; @@ -945,21 +940,20 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq, const std::shared_ptr& inB, uint32_t inBOff, const std::shared_ptr& out, uint32_t outOff, int64_t ne00, int64_t ne01, - uint64_t nb00, uint64_t nb01, uint64_t nb02, - int64_t ne10, int64_t ne11, int64_t ne12, - uint64_t nb10, uint64_t nb11, uint64_t nb12, + 
uint64_t nb01, uint64_t nb02, + int64_t ne11, int64_t ne12, + uint64_t nb11, uint64_t nb12, int64_t ne0, int64_t ne1) { const static auto spirv = glsl_compile_source(program_source_head+program_mul_mat_f16, __func__); struct PushConstants { - int64_t ne00, ne01; - uint64_t nb00, nb01, nb02; - int64_t ne10, ne11; - uint64_t nb10, nb11, nb12; + int64_t ne00; + uint64_t nb01, nb02; + uint64_t nb11, nb12; int64_t ne0, ne1; uint32_t inAOff, inBOff, outOff; } pushConsts { - ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, ne0, ne1, + ne00, nb01, nb02, nb11, nb12, ne0, ne1, inAOff, inBOff, outOff }; @@ -1085,7 +1079,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { - ggml_vk_mul_mat_f16(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, nb00, nb01, nb02, ne10, ne11, ne12, nb10, nb11, nb12, ne0, ne1); + ggml_vk_mul_mat_f16(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1); break; } } From 77ebe46966b9173706b9e58df2ec02711003aced Mon Sep 17 00:00:00 2001 From: niansa Date: Wed, 5 Jul 2023 14:21:16 +0200 Subject: [PATCH 42/43] Fixed case order in ggml_vk_graph_compute --- ggml-vulkan.cpp | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 6aab3ddaed0d4..232109762d937 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1075,21 +1075,25 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { ggml_vk_diag_mask_inf(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, ne02); } break; + case GGML_OP_NORM: + { + ggml_vk_norm(seq, id_src0, offs_src0, id_dst, offs_dst, ne00, ne01, ggml_nrows(src0)); + } break; + case GGML_OP_RMS_NORM: + { + ggml_vk_rms_norm(seq, id_src0, offs_src0, id_dst, offs_dst, ne00, ne01, ggml_nrows(src0)); + } break; case 
GGML_OP_MUL_MAT: { if (src0->type == GGML_TYPE_F16 - && src1->type == GGML_TYPE_F32) { + && src1->type == GGML_TYPE_F32) { ggml_vk_mul_mat_f16(seq, id_src0, offs_src0, id_src1, offs_src1, id_dst, offs_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1); break; + } else { + printf("Unsupported quantization: %u/%u\n", src0->type, src1->type); } } - case GGML_OP_NORM: { - ggml_vk_norm(seq, id_src0, offs_src0, id_dst, offs_dst, ne00, ne01, ggml_nrows(src0)); - } break; - case GGML_OP_RMS_NORM: { - ggml_vk_rms_norm(seq, id_src0, offs_src0, id_dst, offs_dst, ne00, ne01, ggml_nrows(src0)); - } break; - default: + default: {} fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); //GGML_ASSERT(false); } From 44d214c04034b52e098f0df595341b50cab5248e Mon Sep 17 00:00:00 2001 From: niansa Date: Wed, 5 Jul 2023 14:34:18 +0200 Subject: [PATCH 43/43] Only warn if __STDC_IEC_559__ isn't defined --- ggml-vulkan.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 232109762d937..06c0434608ce0 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -16,7 +16,7 @@ #include #ifndef __STDC_IEC_559__ -#error Your C implementation is not IEC 559 compliant, which is required for proper Vulkan interop. +#warning Your C implementation is not IEC 559 compliant, which is required for proper Vulkan interop. #endif #define MULTILINE_QUOTE(...) #__VA_ARGS__