
Commit 540f4e0

Merge remote-tracking branch 'upstream/concedo'
2 parents 2c3b46f + eda663f commit 540f4e0

File tree

16 files changed: +78 -36 lines changed


CMakeLists.txt

Lines changed: 1 addition & 2 deletions
@@ -46,7 +46,7 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern
 set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
 option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
-option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
+option(LLAMA_HIPBLAS "llama: use hipBLAS" ON)
 option(LLAMA_K_QUANTS "llama: use k-quants" ON)


@@ -339,4 +339,3 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
 set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 target_link_libraries(${TARGET} PUBLIC ggml ggml_v1 ggml_v2 common2 gpttype_adapter ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-
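Note that this flips the CMake default: the hipBLAS path is now enabled by default in this fork's CMakeLists.txt. `LLAMA_HIPBLAS` remains an ordinary CMake option, so it can still be switched off at configure time with something like `cmake -B build -DLLAMA_HIPBLAS=OFF` (illustrative invocation; the build directory name is arbitrary).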

Makefile

Lines changed: 6 additions & 2 deletions
@@ -42,7 +42,7 @@ endif

 # keep standard at C11 and C++11
 CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS
-CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -O3 -DNDEBUG -std=c++11 -fPIC
+CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -O3 -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS
 LDFLAGS =

 # these are used on windows, to build some libraries with extra old device compatibility
@@ -53,7 +53,11 @@ NONECFLAGS =
 OPENBLAS_FLAGS = -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
 CLBLAST_FLAGS = -DGGML_USE_CLBLAST
 FAILSAFE_FLAGS = -DUSE_FAILSAFE
-CUBLAS_FLAGS = -DGGML_USE_CUBLAS
+ifdef LLAMA_CUBLAS
+CUBLAS_FLAGS = -DGGML_USE_CUBLAS
+else
+CUBLAS_FLAGS =
+endif
 CUBLASLD_FLAGS =
 CUBLAS_OBJS =

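With this guard, `-DGGML_USE_CUBLAS` only ends up in `CUBLAS_FLAGS` when the build is invoked with the variable defined, e.g. `make LLAMA_CUBLAS=1` (illustrative; any non-empty definition satisfies GNU Make's `ifdef`); otherwise the flag set stays empty. The C++ flags also gain `-DGGML_USE_K_QUANTS`, bringing them in line with the C flags.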

examples/common.cpp

Lines changed: 1 addition & 1 deletion
@@ -110,7 +110,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.seed = std::stoi(argv[i]);
+            params.seed = std::stoul(argv[i]);
         } else if (arg == "-t" || arg == "--threads") {
             if (++i >= argc) {
                 invalid_param = true;
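The switch from `std::stoi` to `std::stoul` matters because `std::stoi` throws `std::out_of_range` for values above `INT_MAX`, so seeds in the upper half of the `uint32_t` range could not previously be passed on the command line. A minimal standalone sketch of the difference (not part of the diff):

    #include <cstdint>
    #include <iostream>
    #include <stdexcept>
    #include <string>

    int main() {
        const std::string arg = "4294967295"; // UINT32_MAX, now a valid seed

        try {
            std::cout << "stoi: " << std::stoi(arg) << "\n";
        } catch (const std::out_of_range &) {
            std::cout << "stoi: out of range\n";          // taken: value > INT_MAX
        }

        uint32_t seed = static_cast<uint32_t>(std::stoul(arg));
        std::cout << "stoul: " << seed << "\n";           // prints 4294967295
        return 0;
    }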

examples/common.h

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@
 int32_t get_num_physical_cores();

 struct gpt_params {
-    int32_t seed = -1; // RNG seed
+    uint32_t seed = -1; // RNG seed
     int32_t n_threads = get_num_physical_cores();
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 512; // context size
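The initializer stays `-1` even though the field is now unsigned: the conversion is well defined and wraps to `0xFFFFFFFF`, which is exactly the `LLAMA_DEFAULT_SEED` value introduced in llama.h, so an unset seed keeps its old spelling. A one-line standalone check of that assumption:

    #include <cstdint>
    static_assert(static_cast<uint32_t>(-1) == 0xFFFFFFFFu, "-1 wraps to LLAMA_DEFAULT_SEED");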

examples/embedding/embedding.cpp

Lines changed: 2 additions & 2 deletions
@@ -24,11 +24,11 @@ int main(int argc, char ** argv) {

     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

-    if (params.seed < 0) {
+    if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }

-    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+    fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);

     std::mt19937 rng(params.seed);
     if (params.random_prompt) {

examples/main/README.md

Lines changed: 1 addition & 1 deletion
@@ -242,7 +242,7 @@ Example usage: `--logit-bias 29905-inf`

 ### RNG Seed

-- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed).
+- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).

 The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run.
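In practice nothing changes for users: a fixed seed such as `./main -m models/7B/ggml-model.bin -p "Once upon a time" -s 42` (paths and prompt illustrative) still reproduces a run, and `-s -1` (or omitting `-s`) still requests a time-based random seed, since `-1` now parses to the `LLAMA_DEFAULT_SEED` sentinel rather than a negative value.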

examples/main/main.cpp

Lines changed: 2 additions & 2 deletions
@@ -94,11 +94,11 @@ int main(int argc, char ** argv) {

     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

-    if (params.seed < 0) {
+    if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }

-    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+    fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);

     std::mt19937 rng(params.seed);
     if (params.random_prompt) {

examples/perplexity/perplexity.cpp

Lines changed: 2 additions & 2 deletions
@@ -136,11 +136,11 @@ int main(int argc, char ** argv) {

     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

-    if (params.seed < 0) {
+    if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }

-    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+    fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);

     std::mt19937 rng(params.seed);
     if (params.random_prompt) {

examples/server/README.md

Lines changed: 1 addition & 1 deletion
@@ -152,7 +152,7 @@ node .

 `mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).

-`seed`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed).
+`seed`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).

 `ignore_eos`: Ignore end of stream token and continue generating (default: false).

examples/train-text-from-scratch/train-text-from-scratch.cpp

Lines changed: 3 additions & 3 deletions
@@ -2768,7 +2768,7 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p
     fprintf(stderr, " --checkpoint-in FNAME path from which to load training checkpoint (default '%s')\n", params->fn_checkpoint_in);
     fprintf(stderr, " --checkpoint-out FNAME path to save training checkpoint (default '%s')\n", params->fn_checkpoint_out);
     fprintf(stderr, " --model-out FNAME path to save ggml model (default '%s')\n", params->fn_model_out);
-    fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
+    fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for -1)\n");
     fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx);
     fprintf(stderr, " --embd N Embedding size used for new models (default %d)\n", params->n_embd);
     fprintf(stderr, " --mult N Mult size used for new models, influences feedforward size. (default %d)\n", params->n_mult);
@@ -3034,10 +3034,10 @@ int main(int argc, char ** argv) {
         return 1;
     }

-    if (params.seed < 0) {
+    if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
-    printf("%s: seed: %d\n", __func__, params.seed);
+    printf("%s: seed: %u\n", __func__, params.seed);
     srand(params.seed);

     struct llama_context_params llama_params = llama_context_default_params();

klite.embd

Lines changed: 3 additions & 3 deletions
Large diffs are not rendered by default.

koboldcpp.py

Lines changed: 1 addition & 1 deletion
@@ -236,7 +236,7 @@ def utfprint(str):
 maxhordelen = 256
 modelbusy = False
 defaultport = 5001
-KcppVersion = "1.33"
+KcppVersion = "1.34"
 showdebug = True

 class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):

llama.cpp

Lines changed: 41 additions & 8 deletions
@@ -12,7 +12,8 @@
 #include "ggml.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
-#elif defined(GGML_USE_CLBLAST)
+#endif
+#if defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
 #endif

@@ -66,6 +67,7 @@ enum e_model {
     MODEL_65B,
 };

+static const size_t kB = 1024;
 static const size_t MB = 1024*1024;

 // computed for n_ctx == 2048
@@ -129,6 +131,34 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
     return k_sizes;
 }

+// amount of VRAM needed per batch size to hold temporary results
+// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
+{
+    static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 512ull * kB },
+        { MODEL_7B, 512ull * kB },
+        { MODEL_13B, 640ull * kB },
+        { MODEL_30B, 768ull * kB },
+        { MODEL_65B, 1536ull * kB },
+    };
+    return k_sizes;
+}
+
+// amount of VRAM needed per batch size and context to hold temporary results
+// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
+{
+    static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B, 128ull },
+        { MODEL_7B, 128ull },
+        { MODEL_13B, 160ull },
+        { MODEL_30B, 208ull },
+        { MODEL_65B, 416ull },
+    };
+    return k_sizes;
+}
+
 // default hparams (LLaMA 7B)
 struct llama_hparams {
     uint32_t n_vocab = 32000;
@@ -777,7 +807,7 @@ static bool kv_cache_init(

 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
-        /*.seed =*/ -1,
+        /*.seed =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx =*/ 512,
         /*.n_batch =*/ 512,
         /*.gpu_layers =*/ 0,
@@ -1113,11 +1143,14 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
         ggml_cuda_set_scratch_size(0); // disable scratch
     } else {
-        vram_scratch = n_batch * MB;
+        const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
+        const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
+        vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
         ggml_cuda_set_scratch_size(vram_scratch);
         if (n_gpu_layers > 0) {
-            fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
-                    __func__, vram_scratch / MB);
+            fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
+                    __func__, vram_scratch_base / kB, vram_scratch_per_context,
+                    (vram_scratch + MB - 1) / MB); // round up
         }
     }
 #endif // GGML_USE_CUBLAS
@@ -2540,7 +2573,7 @@ struct llama_context * llama_new_context_with_model(

     llama_context * ctx = new llama_context(*model, model->vocab);

-    if (params.seed < 0) {
+    if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }

@@ -2974,8 +3007,8 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {

 #define LLAMA_MAX_RNG_STATE (64*1024)

-void llama_set_rng_seed(struct llama_context * ctx, int seed) {
-    if (seed < 0) {
+void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
+    if (seed == LLAMA_DEFAULT_SEED) {
         seed = time(NULL);
     }
     ctx->rng.seed(seed);
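To put the new scratch sizing in perspective, a worked example under assumed settings of n_batch = 512 and n_ctx = 2048 with a 7B model: the per-batch cost is 512 kB + 2048 x 128 B = 524288 B + 262144 B = 768 kB, so vram_scratch = 512 x 768 kB = 384 MB, versus the flat 512 x 1 MB = 512 MB that the old `vram_scratch = n_batch * MB` formula allocated. Larger models pay proportionally more through the per-model table entries, and the printed message now rounds the total up to whole MB.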

llama.h

Lines changed: 8 additions & 6 deletions
@@ -46,6 +46,8 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1

+#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
+
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
@@ -81,11 +83,11 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);

     struct llama_context_params {
-        int seed; // RNG seed, -1 for random
-        int n_ctx; // text context
-        int n_batch; // prompt processing batch size
-        int n_gpu_layers; // number of layers to store in VRAM
-        int main_gpu; // the GPU that is used for scratch and small tensors
+        uint32_t seed; // RNG seed, -1 for random
+        int32_t n_ctx; // text context
+        int32_t n_batch; // prompt processing batch size
+        int32_t n_gpu_layers; // number of layers to store in VRAM
+        int32_t main_gpu; // the GPU that is used for scratch and small tensors
         float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
@@ -196,7 +198,7 @@ extern "C" {
     LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);

     // Sets the current rng seed.
-    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
+    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);

     // Returns the maximum size in bytes of the state (rng, logits, embedding
     // and kv_cache) - will often be smaller after compacting tokens
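A minimal caller-side sketch of the changed seed API (model loading is elided; everything beyond the identifiers shown in llama.h is illustrative):

    #include <cstdint>
    #include "llama.h"

    int main() {
        // Defaults now carry seed == LLAMA_DEFAULT_SEED (0xFFFFFFFF) rather than -1.
        struct llama_context_params params = llama_context_default_params();

        // Old callers that wrote -1 still work: the value wraps to LLAMA_DEFAULT_SEED,
        // which llama_new_context_with_model() replaces with a time-based seed.
        params.seed = -1;

        // For a reproducible run, pick any other 32-bit value instead:
        params.seed = 42;

        // ... create a context from a loaded model, then re-seed later if desired:
        // llama_set_rng_seed(ctx, 42);                  // fixed, reproducible sampling
        // llama_set_rng_seed(ctx, LLAMA_DEFAULT_SEED);  // fresh time-based seed
        return 0;
    }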

otherarch/gpt2_v3.cpp

Lines changed: 3 additions & 1 deletion
@@ -18,10 +18,12 @@

 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
-#elif defined(GGML_USE_CLBLAST)
+#endif
+#if defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
 #endif

+
 // load the model's weights from a file
 ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) {
     printf("%s: loading model from '%s'\n", __func__, fname.c_str());
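The reason for splitting the `#elif` into two independent `#if` blocks (here and in llama.cpp above): with `#elif`, a build that defines both GGML_USE_CUBLAS and GGML_USE_CLBLAST pulled in only ggml-cuda.h; with separate conditionals, each backend header is included whenever its macro is defined, so the two defines no longer exclude each other at the include level.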

otherarch/llama_v2.cpp

Lines changed: 2 additions & 0 deletions
@@ -9,13 +9,15 @@
 #include "llama_v2.h"

 #include "ggml_v2.h"
+
 #ifdef GGML_USE_CUBLAS
 #include "ggml_v2-cuda.h"
 #endif
 #if defined(GGML_USE_CLBLAST)
 #include "ggml_v2-opencl.h"
 #endif

+
 #include <array>
 #include <ctime>
 #include <cinttypes>
