Skip to content

Commit b320f91

Browse files
committed
Use sentencepiece tokenization
1 parent 49064d4 commit b320f91

File tree

6 files changed: +39 −17 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,4 @@ models/*
2121

2222
arm_neon.h
2323
compile_commands.json
24+
deps

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ endif
3131
#
3232

3333
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
34-
CXXFLAGS = -I. -I../../sentencepiece/src/ -O3 -DNDEBUG -std=c++11 -fPIC
34+
CXXFLAGS = -I. -Ideps/sentencepiece-0.1.97/src/ -O3 -DNDEBUG -std=c++11 -fPIC
3535
LDFLAGS =
3636
# OS specific
3737
# TODO: support Windows
@@ -187,7 +187,7 @@ clean:
187187
rm -f *.o main quantize
188188

189189
main: main.cpp ggml.o utils.o
190-
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o /Users/billhamilton/src/sentencepiece/build/src/libsentencepiece.a -o main $(LDFLAGS)
190+
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o deps/libsentencepiece.a -o main $(LDFLAGS)
191191
./main -h
192192

193193
quantize: quantize.cpp ggml.o utils.o

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ Here are the step for the LLaMA-7B model:
132132
# build this repo
133133
git clone https://github.com/ggerganov/llama.cpp
134134
cd llama.cpp
135-
make
135+
./build.sh
136136

137137
# obtain the original LLaMA model weights and place them in ./models
138138
ls ./models

build.sh

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/sh
2+
3+
if [ ! -d deps ]
4+
then
5+
mkdir deps
6+
fi
7+
cd deps
8+
if [ ! -f v0.1.97.tar.gz ]
9+
then
10+
curl -LO https://github.com/google/sentencepiece/archive/refs/tags/v0.1.97.tar.gz
11+
fi
12+
if [ ! -f libsentencepiece.a ]
13+
then
14+
tar xzvf v0.1.97.tar.gz
15+
cd sentencepiece-0.1.97/ && rm -rf build && mkdir build && cd build && cmake ..
16+
make sentencepiece-static -j $(nproc)
17+
cd ../..
18+
cp sentencepiece-0.1.97/build/src/libsentencepiece.a ./
19+
fi
20+
cd ..
21+
make

build_deps.sh

Lines changed: 0 additions & 12 deletions
This file was deleted.

main.cpp

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -855,6 +855,8 @@ int main(int argc, char ** argv) {
855855
printf("\n\n");
856856

857857
std::vector<gpt_vocab::id> embd;
858+
std::vector<gpt_vocab::id> all_tokens;
859+
std::string full_text = "";
858860

859861
// determine the required inference memory per token:
860862
size_t mem_per_token = 0;
@@ -919,6 +921,7 @@ int main(int argc, char ** argv) {
919921

920922
last_n_tokens.erase(last_n_tokens.begin());
921923
last_n_tokens.push_back(id);
924+
all_tokens.push_back(id);
922925

923926
t_sample_us += ggml_time_us() - t_start_sample_us;
924927
}
@@ -937,6 +940,7 @@ int main(int argc, char ** argv) {
937940
embd.push_back(embd_inp[input_consumed]);
938941
last_n_tokens.erase(last_n_tokens.begin());
939942
last_n_tokens.push_back(embd_inp[input_consumed]);
943+
all_tokens.push_back(embd_inp[input_consumed]);
940944
++input_consumed;
941945
if (embd.size() > params.n_batch) {
942946
break;
@@ -949,8 +953,16 @@ int main(int argc, char ** argv) {
949953
}
950954

951955
// display text
952-
std::string text;
953-
processor.Decode(all_tokens, &text);
956+
std::string check = processor.IdToPiece(all_tokens.at(all_tokens.size()-1));
957+
if(check != "") { // ensure a multi-byte token is finished generating before outputting the text
958+
std::string text;
959+
processor.Decode(all_tokens, &text);
960+
std::string chunk = text.substr(full_text.length());
961+
printf("%s", chunk.c_str());
962+
full_text += chunk;
963+
fflush(stdout);
964+
}
965+
954966

955967
// in interactive mode, and not currently processing queued inputs;
956968
// check if we should prompt the user for more

0 commit comments

Comments (0)