
Commit beebdfd

Merge branch 'master' into gg/llama-refactor-sampling

2 parents a880be2 + ecf6b7f


82 files changed: +2783, -1920 lines

.devops/nix/package.nix (+3, -10)

@@ -126,16 +126,9 @@ let
     ++ optionals useMetalKit [ MetalKit ];
 
   cudaBuildInputs = with cudaPackages; [
-    cuda_cccl.dev # <nv/target>
-
-    # A temporary hack for reducing the closure size, remove once cudaPackages
-    # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
-    cuda_cudart.dev
-    cuda_cudart.lib
-    cuda_cudart.static
-    libcublas.dev
-    libcublas.lib
-    libcublas.static
+    cuda_cudart
+    cuda_cccl # <nv/target>
+    libcublas
   ];
 
   rocmBuildInputs = with rocmPackages; [

.github/workflows/build.yml (+2, -1)

@@ -860,7 +860,8 @@ jobs:
           mkdir build
           cd build
           cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
-          cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1))
+          cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
+          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
 
       - name: Determine tag name
         id: tag

.gitignore (+1)

@@ -50,6 +50,7 @@ build*
 !docs/build.md
 /libllama.so
 /llama-*
+/vulkan-shaders-gen
 android-ndk-*
 arm_neon.h
 cmake-build-*

CMakeLists.txt (+2, -1)

@@ -139,7 +139,8 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location o
 # determining _precisely_ which defines are necessary for the llama-config
 # package.
 #
-get_directory_property(GGML_DIR_DEFINES DIRECTORY ggml/src COMPILE_DEFINITIONS)
+get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
+get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
 get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
 set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
 get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)

Makefile (+63, -27)

@@ -325,9 +325,9 @@ ifdef LLAMA_DEBUG
 endif
 else
     MK_CPPFLAGS  += -DNDEBUG
-    MK_CFLAGS    += -O3
-    MK_CXXFLAGS  += -O3
-    MK_NVCCFLAGS += -O3
+    MK_CFLAGS    += -O3 -g
+    MK_CXXFLAGS  += -O3 -g
+    MK_NVCCFLAGS += -O3 -g
 endif
 
 ifdef LLAMA_SANITIZE_THREAD
@@ -528,10 +528,21 @@ ifndef GGML_NO_ACCELERATE
     endif
 endif # GGML_NO_ACCELERATE
 
+ifdef GGML_MUSA
+    CC  := clang
+    CXX := clang++
+    GGML_CUDA := 1
+    MK_CPPFLAGS += -DGGML_USE_MUSA
+endif
+
 ifndef GGML_NO_OPENMP
     MK_CPPFLAGS += -DGGML_USE_OPENMP
     MK_CFLAGS   += -fopenmp
     MK_CXXFLAGS += -fopenmp
+    ifdef GGML_MUSA
+        MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
+        MK_LDFLAGS  += -L/usr/lib/llvm-10/lib
+    endif # GGML_MUSA
 endif # GGML_NO_OPENMP
 
 ifdef GGML_OPENBLAS
@@ -582,15 +593,27 @@ else
 endif # GGML_CUDA_FA_ALL_QUANTS
 
 ifdef GGML_CUDA
-    ifneq ('', '$(wildcard /opt/cuda)')
-        CUDA_PATH ?= /opt/cuda
+    ifdef GGML_MUSA
+        ifneq ('', '$(wildcard /opt/musa)')
+            CUDA_PATH ?= /opt/musa
+        else
+            CUDA_PATH ?= /usr/local/musa
+        endif
+
+        MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include
+        MK_LDFLAGS   += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64
+        MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_22
     else
-        CUDA_PATH ?= /usr/local/cuda
-    endif
+        ifneq ('', '$(wildcard /opt/cuda)')
+            CUDA_PATH ?= /opt/cuda
+        else
+            CUDA_PATH ?= /usr/local/cuda
+        endif
 
-    MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
-    MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
-    MK_NVCCFLAGS += -use_fast_math
+        MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
+        MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
+        MK_NVCCFLAGS += -use_fast_math
+    endif # GGML_MUSA
 
     OBJ_GGML += ggml/src/ggml-cuda.o
     OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
@@ -600,9 +623,11 @@ ifdef LLAMA_FATAL_WARNINGS
     MK_NVCCFLAGS += -Werror all-warnings
 endif # LLAMA_FATAL_WARNINGS
 
+ifndef GGML_MUSA
 ifndef JETSON_EOL_MODULE_DETECT
     MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
+endif # GGML_MUSA
 
 ifdef LLAMA_DEBUG
     MK_NVCCFLAGS += -lineinfo
@@ -615,8 +640,12 @@ endif # GGML_CUDA_DEBUG
 ifdef GGML_CUDA_NVCC
     NVCC = $(CCACHE) $(GGML_CUDA_NVCC)
 else
-    NVCC = $(CCACHE) nvcc
-endif #GGML_CUDA_NVCC
+    ifdef GGML_MUSA
+        NVCC = $(CCACHE) mcc
+    else
+        NVCC = $(CCACHE) nvcc
+    endif # GGML_MUSA
+endif # GGML_CUDA_NVCC
 
 ifdef CUDA_DOCKER_ARCH
     MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
@@ -687,9 +716,15 @@ define NVCC_COMPILE
 	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endef # NVCC_COMPILE
 else
+ifdef GGML_MUSA
+define NVCC_COMPILE
+	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -c $< -o $@
+endef # NVCC_COMPILE
+else
 define NVCC_COMPILE
 	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endef # NVCC_COMPILE
+endif # GGML_MUSA
 endif # JETSON_EOL_MODULE_DETECT
 
 ggml/src/ggml-cuda/%.o: \
@@ -943,6 +978,7 @@ $(info I CXX: $(shell $(CXX) --version | head -n 1))
 ifdef GGML_CUDA
     $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
     CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+ifndef GGML_MUSA
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
 
 ifndef CUDA_DOCKER_ARCH
@@ -952,6 +988,7 @@ endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
+endif # GGML_MUSA
 endif # GGML_CUDA
 $(info )
 
@@ -1562,42 +1599,41 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
 # Mark legacy binary targets as .PHONY so that they are always checked.
 .PHONY: main quantize perplexity embedding server
 
+# Define the object file target
+examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 # NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
 # Eventually we will want to remove these target from building all the time.
-main: examples/deprecation-warning/deprecation-warning.cpp
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+main: examples/deprecation-warning/deprecation-warning.o
+	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
 	@echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."
 
-server: examples/deprecation-warning/deprecation-warning.cpp
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+server: examples/deprecation-warning/deprecation-warning.o
+	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
 	@echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."
 
-quantize: examples/deprecation-warning/deprecation-warning.cpp
+quantize: examples/deprecation-warning/deprecation-warning.o
 ifneq (,$(wildcard quantize))
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
 	@echo "#########"
 	@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
 	@echo " Remove the 'quantize' binary to remove this warning."
 	@echo "#########"
 endif
 
-perplexity: examples/deprecation-warning/deprecation-warning.cpp
+perplexity: examples/deprecation-warning/deprecation-warning.o
 ifneq (,$(wildcard perplexity))
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
 	@echo "#########"
 	@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
 	@echo " Remove the 'perplexity' binary to remove this warning."
 	@echo "#########"
 endif
 
-embedding: examples/deprecation-warning/deprecation-warning.cpp
+embedding: examples/deprecation-warning/deprecation-warning.o
 ifneq (,$(wildcard embedding))
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
 	@echo "#########"
 	@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
 	@echo " Remove the 'embedding' binary to remove this warning."

README.md (+1)

@@ -409,6 +409,7 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
 | [BLAS](./docs/build.md#blas-build) | All |
 | [BLIS](./docs/backend/BLIS.md) | All |
 | [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
+| [MUSA](./docs/build.md#musa) | Moore Threads GPU |
 | [CUDA](./docs/build.md#cuda) | Nvidia GPU |
 | [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
 | [Vulkan](./docs/build.md#vulkan) | GPU |

common/common.cpp (+6, -1)

@@ -1324,6 +1324,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         else { invalid_param = true; }
         return true;
     }
+    if (arg == "--no-warmup") {
+        params.warmup = false;
+        return true;
+    }
 #ifndef LOG_DISABLE_LOGS
     // Parse args for logging parameters
     if (log_param_single_parse(argv[i])) {
@@ -1446,6 +1450,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
     options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
     options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
+    options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" });
     options.push_back({ "server infill",
         " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
 
@@ -1629,7 +1634,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
     options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
     options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
-    options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
+    options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
     options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
     options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
     options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });

convert_hf_to_gguf.py (+29, -1)

@@ -316,7 +316,7 @@ def prepare_tensors(self):
                 if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
                     if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
                         data = gguf.quantize_bf16(data)
-                        assert data.dtype == np.int16
+                        assert data.dtype == np.uint16
                         data_qtype = gguf.GGMLQuantizationType.BF16
 
                     elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
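
The corrected assertion checks for `np.uint16` because a bfloat16 value is the upper half of a float32 bit pattern, so the quantized buffer is naturally a vector of unsigned 16-bit words. Below is a minimal truncating sketch of that representation, for illustration only; it is not part of the commit, the helper name is ours, and the real `gguf.quantize_bf16` may additionally handle rounding and NaNs.

```python
import numpy as np

def bf16_truncate(x: np.ndarray) -> np.ndarray:
    # keep only the upper 16 bits of each float32 bit pattern -> bfloat16 payload
    return (x.astype(np.float32).view(np.uint32) >> 16).astype(np.uint16)

data = np.array([1.0, -2.5, 3.14159], dtype=np.float32)
bf16 = bf16_truncate(data)
assert bf16.dtype == np.uint16   # same dtype the converter now asserts
print(bf16)                      # raw bfloat16 bit patterns, e.g. [16256 49184 16457]
```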
@@ -1570,6 +1570,34 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]
 
     def prepare_tensors(self):
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+
         super().prepare_tensors()
 
         if self._experts is not None:
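
For a quick sanity check outside the converter, the llama3 rope-scaling logic added above can be paraphrased as a standalone function. This sketch is not part of the commit: it uses NumPy instead of torch, and the default arguments (base frequency, head dimension, context length) are illustrative values only.

```python
import math
import numpy as np

def llama3_rope_factors(base: float = 500000.0, dim: int = 128, factor: float = 8.0,
                        low_freq_factor: float = 1.0, high_freq_factor: float = 4.0,
                        old_context_len: int = 8192) -> np.ndarray:
    # inverse frequencies for every second dimension, as in the converter hunk
    freqs = 1.0 / (base ** (np.arange(0, dim, 2, dtype=np.float32) / dim))
    low_freq_wavelen = old_context_len / low_freq_factor
    high_freq_wavelen = old_context_len / high_freq_factor

    rope_factors = []
    for freq in freqs:
        wavelen = 2 * math.pi / freq
        if wavelen < high_freq_wavelen:
            rope_factors.append(1.0)        # short wavelengths are left unscaled
        elif wavelen > low_freq_wavelen:
            rope_factors.append(factor)     # long wavelengths get the full scaling factor
        else:
            # smooth interpolation between the two regimes
            smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
            rope_factors.append(1.0 / ((1.0 - smooth) / factor + smooth))
    return np.array(rope_factors, dtype=np.float32)

print(llama3_rope_factors()[:4])  # inspect the first few factors
```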

docs/build.md (+18, -1)

@@ -178,7 +178,11 @@ For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](ht
 cmake --build build --config Release
 ```
 
-The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
+The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
+
+The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
+
+The following compilation options are also available to tweak performance:
 
 | Option | Legal values | Default | Description |
 |-------------------------------|------------------------|---------|-------------|
@@ -192,6 +196,19 @@ The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/c
 | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
 | GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
 
+### MUSA
+
+- Using `make`:
+  ```bash
+  make GGML_MUSA=1
+  ```
+- Using `CMake`:
+
+  ```bash
+  cmake -B build -DGGML_MUSA=ON
+  cmake --build build --config Release
+  ```
+
 ### hipBLAS
 
 This provides BLAS acceleration on HIP-supported AMD GPUs.

examples/baby-llama/baby-llama.cpp (-1)

@@ -1,7 +1,6 @@
 #include "ggml.h"
 #include "train.h"
 
-#include <vector>
 #include <cassert>
 #include <cstdlib>
 #include <cstring>

examples/batched-bench/batched-bench.cpp (+1, -1)

@@ -69,7 +69,7 @@ int main(int argc, char ** argv) {
     llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
 
     // ensure enough sequences are available
-    ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
+    ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
 
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
 

examples/eval-callback/eval-callback.cpp (+1, -1)

@@ -62,7 +62,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
             } else if (type == GGML_TYPE_I8) {
                 v = (float) *(int8_t *) &data[i];
             } else {
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
             }
             printf("%12.4f", v);
             sum += v;

examples/imatrix/imatrix.cpp (+2, -2)

@@ -127,7 +127,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         }
         else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
             fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
-            exit(1); //GGML_ASSERT(false);
+            exit(1); //GGML_ABORT("fatal error");
         }
         if (m_params.verbosity > 1) {
             printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
@@ -176,7 +176,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         }
         else if (e.values.size() != (size_t)src1->ne[0]) {
             fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
-            exit(1); //GGML_ASSERT(false);
+            exit(1); //GGML_ABORT("fatal error");
         }
         ++e.ncall;
         if (m_params.verbosity > 1) {
