Skip to content

Commit b320f91

Browse files
committed
Use sentencepiece tokenization
1 parent 49064d4 commit b320f91

File tree

6 files changed: +39 −17 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,4 @@ models/*
2121

2222
arm_neon.h
2323
compile_commands.json
24+
deps

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ endif
3131
#
3232

3333
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
34-
CXXFLAGS = -I. -I../../sentencepiece/src/ -O3 -DNDEBUG -std=c++11 -fPIC
34+
CXXFLAGS = -I. -Ideps/sentencepiece-0.1.97/src/ -O3 -DNDEBUG -std=c++11 -fPIC
3535
LDFLAGS =
3636
# OS specific
3737
# TODO: support Windows
@@ -187,7 +187,7 @@ clean:
187187
rm -f *.o main quantize
188188

189189
main: main.cpp ggml.o utils.o
190-
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o /Users/billhamilton/src/sentencepiece/build/src/libsentencepiece.a -o main $(LDFLAGS)
190+
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o deps/libsentencepiece.a -o main $(LDFLAGS)
191191
./main -h
192192

193193
quantize: quantize.cpp ggml.o utils.o

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ Here are the step for the LLaMA-7B model:
132132
# build this repo
133133
git clone https://github.com/ggerganov/llama.cpp
134134
cd llama.cpp
135-
make
135+
./build.sh
136136

137137
# obtain the original LLaMA model weights and place them in ./models
138138
ls ./models

build.sh

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/sh
2+
3+
if [ ! -d deps ]
4+
then
5+
mkdir deps
6+
fi
7+
cd deps
8+
if [ ! -f v0.1.97.tar.gz ]
9+
then
10+
curl -LO https://github.com/google/sentencepiece/archive/refs/tags/v0.1.97.tar.gz
11+
fi
12+
if [ ! -f libsentencepiece.a ]
13+
then
14+
tar xzvf v0.1.97.tar.gz
15+
cd sentencepiece-0.1.97/ && rm -rf build && mkdir build && cd build && cmake ..
16+
make sentencepiece-static -j $(nproc)
17+
cd ../..
18+
cp sentencepiece-0.1.97/build/src/libsentencepiece.a ./
19+
fi
20+
cd ..
21+
make

build_deps.sh

Lines changed: 0 additions & 12 deletions
This file was deleted.

main.cpp

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -855,6 +855,8 @@ int main(int argc, char ** argv) {
855855
printf("\n\n");
856856

857857
std::vector<gpt_vocab::id> embd;
858+
std::vector<gpt_vocab::id> all_tokens;
859+
std::string full_text = "";
858860

859861
// determine the required inference memory per token:
860862
size_t mem_per_token = 0;
@@ -919,6 +921,7 @@ int main(int argc, char ** argv) {
919921

920922
last_n_tokens.erase(last_n_tokens.begin());
921923
last_n_tokens.push_back(id);
924+
all_tokens.push_back(id);
922925

923926
t_sample_us += ggml_time_us() - t_start_sample_us;
924927
}
@@ -937,6 +940,7 @@ int main(int argc, char ** argv) {
937940
embd.push_back(embd_inp[input_consumed]);
938941
last_n_tokens.erase(last_n_tokens.begin());
939942
last_n_tokens.push_back(embd_inp[input_consumed]);
943+
all_tokens.push_back(embd_inp[input_consumed]);
940944
++input_consumed;
941945
if (embd.size() > params.n_batch) {
942946
break;
@@ -949,8 +953,16 @@ int main(int argc, char ** argv) {
949953
}
950954

951955
// display text
952-
std::string text;
953-
processor.Decode(all_tokens, &text);
956+
std::string check = processor.IdToPiece(all_tokens.at(all_tokens.size()-1));
957+
if(check != "") { // ensure a multi-byte token is finished generating before outputting the text
958+
std::string text;
959+
processor.Decode(all_tokens, &text);
960+
std::string chunk = text.substr(full_text.length());
961+
printf("%s", chunk.c_str());
962+
full_text += chunk;
963+
fflush(stdout);
964+
}
965+
954966

955967
// in interactive mode, and not currently processing queued inputs;
956968
// check if we should prompt the user for more

0 commit comments

Comments (0)