
Commit 9bfb2fd: Merge branch 'concedo_experimental'

2 parents: b379f9d + 66328fc


42 files changed: 3111 additions & 1539 deletions

.gitignore

Lines changed: 14 additions & 0 deletions
@@ -64,6 +64,20 @@ qnt-*.txt
 perf-*.txt
 
 examples/jeopardy/results.txt
+
+pyproject.toml
+poetry.lock
+poetry.toml
+
+# Test binaries
+tests/test-double-float
+tests/test-grad0
+tests/test-opt
+tests/test-quantize-fns
+tests/test-quantize-perf
+tests/test-sampling
+tests/test-tokenizer-0
+
 koboldcpp.so
 koboldcpp_failsafe.so
 koboldcpp_openblas.so

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -100,7 +100,7 @@ if (LLAMA_CUBLAS)
         if (LLAMA_CUDA_DMMV_F16)
             set(CMAKE_CUDA_ARCHITECTURES "60;61") # needed for f16 CUDA intrinsics
         else()
-            set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
+            set(CMAKE_CUDA_ARCHITECTURES "37;86") # lowest CUDA 12 standard + lowest for integer intrinsics
         endif()
     endif()
     message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
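
Since the default architecture list changes here from "52;61" to "37;86", builders targeting other GPUs may want to pin the list themselves. A minimal sketch, assuming the surrounding CMake only applies this default when CMAKE_CUDA_ARCHITECTURES is not already defined (that guard is not visible in the hunk) and that the build directory name is arbitrary:

```bash
# Hypothetical configure/build from the repository root; adjust the
# architecture list to the compute capability of the target GPU.
mkdir -p build && cd build
cmake .. -DLLAMA_CUBLAS=ON -DCMAKE_CUDA_ARCHITECTURES="60;61"
cmake --build . --config Release
```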

Makefile

Lines changed: 3 additions & 1 deletion
@@ -374,6 +374,8 @@ llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
+grammar-parser.o: examples/grammar-parser.cpp examples/grammar-parser.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 expose.o: expose.cpp expose.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 gpttype_adapter_failsafe.o: gpttype_adapter.cpp
@@ -388,7 +390,7 @@ gpttype_adapter_cublas.o: gpttype_adapter.cpp
 clean:
 	rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_openblas_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so
 
-main: examples/main/main.cpp build-info.h ggml.o k_quants.o llama.o common.o $(OBJS)
+main: examples/main/main.cpp build-info.h ggml.o k_quants.o llama.o common.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '==== Run ./main -h for help. ===='
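
Because grammar-parser.o is now a prerequisite of main, a plain rebuild picks up the new object file automatically; a small sketch of the commands the Makefile itself suggests:

```bash
# Rebuild main from a clean tree (this also compiles the new
# grammar-parser.o target), then print the help text the Makefile hints at.
make clean
make main
./main -h
```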

ci/README.md

Lines changed: 5 additions & 0 deletions
@@ -16,5 +16,10 @@ It is a good practice, before publishing changes to execute the full CI locally
 
 ```bash
 mkdir tmp
+
+# CPU-only build
 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+
+# with CUDA support
+GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 ```

ci/run.sh

Lines changed: 164 additions & 17 deletions
Large diffs are not rendered by default.

convert.py

Lines changed: 45 additions & 21 deletions
@@ -142,9 +142,9 @@ def find_n_mult(n_ff: int, n_embd: int) -> int:
 @dataclass
 class Params:
     n_vocab: int
-    n_embd: int
-    n_mult: int
-    n_head: int
+    n_embd:  int
+    n_mult:  int
+    n_head:  int
     n_layer: int
 
     @staticmethod
@@ -167,40 +167,65 @@ def guessed(model: 'LazyModel') -> 'Params':
         n_head=n_embd // 128 # guessed
 
         return Params(
-            n_vocab=n_vocab,
-            n_embd=n_embd,
-            n_mult=256,
-            n_head=n_head,
-            n_layer=n_layer,
+            n_vocab = n_vocab,
+            n_embd  = n_embd,
+            n_mult  = 256,
+            n_head  = n_head,
+            n_layer = n_layer,
         )
 
     @staticmethod
     def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
         config = json.load(open(config_path))
 
         n_vocab = config["vocab_size"];
-        n_embd = config["hidden_size"];
-        n_head = config["num_attention_heads"];
+        n_embd  = config["hidden_size"];
+        n_head  = config["num_attention_heads"];
         n_layer = config["num_hidden_layers"];
-        n_ff = config["intermediate_size"];
+        n_ff    = config["intermediate_size"];
 
         n_mult = find_n_mult(n_ff, n_embd);
 
         return Params(
-            n_vocab=n_vocab,
-            n_embd=n_embd,
-            n_mult=n_mult,
-            n_head=n_head,
-            n_layer=n_layer,
+            n_vocab = n_vocab,
+            n_embd  = n_embd,
+            n_mult  = n_mult,
+            n_head  = n_head,
+            n_layer = n_layer,
+        )
+
+    # LLaMA v2 70B params.json
+    # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1
+    @staticmethod
+    def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
+        config = json.load(open(config_path))
+
+        n_vocab = config["vocab_size"];
+        n_embd  = config["dim"];
+        n_head  = config["n_heads"];
+        n_layer = config["n_layers"];
+        n_mult  = config["multiple_of"];
+
+        if n_vocab == -1:
+            n_vocab = model["tok_embeddings.weight"].shape[0]
+
+        return Params(
+            n_vocab = n_vocab,
+            n_embd  = n_embd,
+            n_mult  = n_mult,
+            n_head  = n_head,
+            n_layer = n_layer,
         )
 
     @staticmethod
     def load(model_plus: 'ModelPlus') -> 'Params':
+        hf_config_path = model_plus.paths[0].parent / "config.json"
         orig_config_path = model_plus.paths[0].parent / "params.json"
-        hf_transformer_config_path = model_plus.paths[0].parent / "config.json"
 
-        if hf_transformer_config_path.exists():
-            params = Params.loadHFTransformerJson(model_plus.model, hf_transformer_config_path)
+        if hf_config_path.exists():
+            params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
+        elif orig_config_path.exists():
+            params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
         else:
            params = Params.guessed(model_plus.model)
 
@@ -1036,8 +1061,7 @@ def write_vocab(self, vocab: Vocab) -> None:
     @staticmethod
     def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
         of = OutputFile(fname_out)
-        params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0,
-                        n_head=1, n_layer=0)
+        params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0)
         of = OutputFile(fname_out)
         of.write_file_header(params, file_type=GGMLFileType.AllF32)
         of.write_vocab(vocab)
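
With the new loadOriginalParamsJson path, Params.load() first looks for an HF-style config.json and then falls back to an original-format params.json, reading the vocab size from the tok_embeddings.weight tensor when params.json reports -1. A rough usage sketch, assuming an original-format LLaMA v2 checkpoint sits under models/llama-2-70b/ (the directory name is illustrative):

```bash
# Convert a directory that carries params.json instead of config.json;
# convert.py infers n_embd, n_head, n_layer and n_mult from params.json.
python3 convert.py models/llama-2-70b/
```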

examples/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -13,6 +13,8 @@ set(TARGET common)
 add_library(${TARGET} OBJECT
     common.h
     common.cpp
+    grammar-parser.h
+    grammar-parser.cpp
     )
 
 if (BUILD_SHARED_LIBS)

examples/Miku.sh

Lines changed: 9 additions & 8 deletions
@@ -2,38 +2,39 @@
 set -e
 
 AI_NAME="${AI_NAME:-Miku}"
-MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}"
+MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}"
 USER_NAME="${USER_NAME:-Anon}"
 
 # Uncomment and adjust to the number of CPU cores you want to use.
 #N_THREAD="${N_THREAD:-4}"
+CTX_SIZE="${CTX_SIZE:-4096}"
 N_PREDICTS="${N_PREDICTS:-4096}"
 
 GEN_OPTIONS=(--batch_size 1024
---ctx_size 2048
+--ctx_size "$CTX_SIZE"
 --keep -1
 --repeat_last_n 256
 --repeat_penalty 1.17647
---temp 0.7
---top_k 40
---top_p 0.5)
+--temp 0.6
+--mirostat 2)
 
 if [ -n "$N_THREAD" ]; then
     GEN_OPTIONS+=(--threads "$N_THREAD")
 fi
 
 ./main "${GEN_OPTIONS[@]}" \
     --model "$MODEL" \
+    --in-prefix " " \
+    --in-suffix "${AI_NAME}:" \
     --n_predict "$N_PREDICTS" \
     --color --interactive \
     --reverse-prompt "${USER_NAME}:" \
-    --prompt "
-This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
+    --prompt "This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
 ${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
 ${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help.
 ${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad.
 ${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her.
-The conversation is only between ${USER_NAME} and ${AI_NAME}
+The conversation is only between ${USER_NAME} and ${AI_NAME}.
 The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
 ${AI_NAME} can only communicate through text, so she can't send images or videos.
