Server: enable lookup decoding

JohannesGaessler · JohannesGaessler · commit 87e5656bab67 · 2024-04-22T14:27:21.000+02:00
diff --git a/Makefile b/Makefile
@@ -800,7 +800,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o ngram-cache.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
diff --git a/common/ngram-cache.cpp b/common/ngram-cache.cpp
@@ -216,12 +216,11 @@ void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filen
 
 }
 
-llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
+bool llama_ngram_cache_load(llama_ngram_cache & ngram_cache, std::string & filename) {
     std::ifstream hashmap_file(filename, std::ios::binary);
     if (!hashmap_file) {
-        throw std::ifstream::failure("Unable to open file " + filename);
+        return false;
     }
-    llama_ngram_cache ngram_cache;
 
     llama_ngram ngram;
     int32_t     ntokens;
@@ -251,7 +250,7 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
     }
     GGML_ASSERT(hashmap_file.eof());
 
-    return ngram_cache;
+    return true;
 }
 
 void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
diff --git a/common/ngram-cache.h b/common/ngram-cache.h
@@ -84,9 +84,10 @@ void llama_ngram_cache_draft(
 void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);
 
 // Load an ngram cache saved with llama_ngram_cache_save.
+// ngram_cache: the ngram cache to load the data into.
 // filename: the path from which to load the ngram cache.
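-// returns:  an ngram cache containing the information saved to filename.
+// returns:  true if filename could be opened and its contents were loaded into ngram_cache, false if the file could not be opened.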
-llama_ngram_cache llama_ngram_cache_load(std::string & filename);
+bool llama_ngram_cache_load(llama_ngram_cache & ngram_cache, std::string & filename);
 
 // Merge two ngram caches.
 // ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
diff --git a/examples/lookup/lookup-merge.cpp b/examples/lookup/lookup-merge.cpp
@@ -33,11 +33,13 @@ int main(int argc, char ** argv){
     }
 
     fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str());
-    llama_ngram_cache ngram_cache_merged = llama_ngram_cache_load(args[0]);
+    llama_ngram_cache ngram_cache_merged;
+    GGML_ASSERT(llama_ngram_cache_load(ngram_cache_merged, args[0]));
 
     for (size_t i = 1; i < args.size()-1; ++i) {
         fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str());
-        llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i]);
+        llama_ngram_cache ngram_cache;
+        GGML_ASSERT(llama_ngram_cache_load(ngram_cache, args[i]));
 
         llama_ngram_cache_merge(ngram_cache_merged, ngram_cache);
     }
diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp
@@ -47,18 +47,15 @@ int main(int argc, char ** argv){
         const int64_t t_start_draft_us = ggml_time_us();
 
         if (!params.lookup_cache_static.empty()) {
-            try {
-                ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
-            } catch (std::ifstream::failure const &) {
+            if(!llama_ngram_cache_load(ngram_cache_static, params.lookup_cache_static)) {
                 fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
                 exit(1);
             }
         }
 
         if (!params.lookup_cache_dynamic.empty()) {
-            try {
-                ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
-            } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
+            // If the dynamic lookup cache doesn't exist it will be created at the end of the program:
+            llama_ngram_cache_load(ngram_cache_dynamic, params.lookup_cache_dynamic);
         }
 
         t_draft_flat_us += ggml_time_us() - t_start_draft_us;
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
@@ -57,18 +57,15 @@ int main(int argc, char ** argv){
         llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
 
         if (!params.lookup_cache_static.empty()) {
-            try {
-                ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
-            } catch (std::ifstream::failure const &) {
+            if(!llama_ngram_cache_load(ngram_cache_static, params.lookup_cache_static)) {
                 fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
                 exit(1);
             }
         }
 
         if (!params.lookup_cache_dynamic.empty()) {
-            try {
-                ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
-            } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
+            // If the dynamic lookup cache doesn't exist it will be created at the end of the program:
+            llama_ngram_cache_load(ngram_cache_dynamic, params.lookup_cache_dynamic);
         }
 
         t_draft_flat_us += ggml_time_us() - t_start_draft_us;
diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py
@@ -45,6 +45,9 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--ubatch-size", type=int, help="physical maximum batch size", required=True)
     parser.add_argument("--scenario", type=str, help="Scenario to run", required=True)
     parser.add_argument("--duration", type=str, help="Bench scenario", required=True)
+    parser.add_argument("--draft", type=int, help="Max. number of additional tokens to draft for lookup decoding", required=False, default=5)
+    parser.add_argument("-lcs", "--lookup-cache-static", type=str, help="Path to optional static lookup cache to use.", required=False, default=None)
+    parser.add_argument("-lcd", "--lookup-cache-dynamic", type=str, help="Path to optional dynamic lookup cache to use. Will be overwritten upon server shutdown.", required=False, default=None)
 
     args = parser.parse_args(args_in)
 
@@ -269,6 +272,11 @@ def start_server_background(args):
     server_args.append('--cont-batching')
     server_args.append('--metrics')
     server_args.extend(['--log-format', "text"])
+    server_args.extend(['--draft', args.draft])
+    if args.lookup_cache_static is not None:
+        server_args.extend(['--lookup-cache-static', args.lookup_cache_static])
+    if args.lookup_cache_dynamic is not None:
+        server_args.extend(['--lookup-cache-dynamic', args.lookup_cache_dynamic])
     args = [str(arg) for arg in [server_path, *server_args]]
     print(f"bench: starting server with: {' '.join(args)}")
     pkwargs = {
diff --git a/examples/server/server.cpp b/examples/server/server.cpp

Original file line number	Diff line number	Diff line change
`@@ -216,12 +216,11 @@ void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filen`
`216`	`216`
`217`	`217`	`}`
`218`	`218`
`219`		`-llama_ngram_cache llama_ngram_cache_load(std::string & filename) {`
	`219`	`+bool llama_ngram_cache_load(llama_ngram_cache & ngram_cache, std::string & filename) {`
`220`	`220`	`std::ifstream hashmap_file(filename, std::ios::binary);`
`221`	`221`	`if (!hashmap_file) {`
`222`		`- throw std::ifstream::failure("Unable to open file " + filename);`
	`222`	`+ return false;`
`223`	`223`	`}`
`224`		`- llama_ngram_cache ngram_cache;`
`225`	`224`
`226`	`225`	`llama_ngram ngram;`
`227`	`226`	`int32_t ntokens;`
`@@ -251,7 +250,7 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {`
`251`	`250`	`}`
`252`	`251`	`GGML_ASSERT(hashmap_file.eof());`
`253`	`252`
`254`		`- return ngram_cache;`
	`253`	`+ return true;`
`255`	`254`	`}`
`256`	`255`
`257`	`256`	`void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {`
Original file line number	Diff line number	Diff line change
`@@ -33,11 +33,13 @@ int main(int argc, char ** argv){`
`33`	`33`	`}`
`34`	`34`
`35`	`35`	`fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str());`
`36`		`- llama_ngram_cache ngram_cache_merged = llama_ngram_cache_load(args[0]);`
	`36`	`+ llama_ngram_cache ngram_cache_merged;`
	`37`	`+ GGML_ASSERT(llama_ngram_cache_load(ngram_cache_merged, args[0]));`
`37`	`38`
`38`	`39`	`for (size_t i = 1; i < args.size()-1; ++i) {`
`39`	`40`	`fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str());`
`40`		`- llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i]);`
	`41`	`+ llama_ngram_cache ngram_cache;`
	`42`	`+ GGML_ASSERT(llama_ngram_cache_load(ngram_cache, args[i]));`
`41`	`43`
`42`	`44`	`llama_ngram_cache_merge(ngram_cache_merged, ngram_cache);`
`43`	`45`	`}`
Original file line number	Diff line number	Diff line change
`@@ -47,18 +47,15 @@ int main(int argc, char ** argv){`
`47`	`47`	`const int64_t t_start_draft_us = ggml_time_us();`
`48`	`48`
`49`	`49`	`if (!params.lookup_cache_static.empty()) {`
`50`		`- try {`
`51`		`- ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);`
`52`		`- } catch (std::ifstream::failure const &) {`
	`50`	`+ if(!llama_ngram_cache_load(ngram_cache_static, params.lookup_cache_static)) {`
`53`	`51`	`fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());`
`54`	`52`	`exit(1);`
`55`	`53`	`}`
`56`	`54`	`}`
`57`	`55`
`58`	`56`	`if (!params.lookup_cache_dynamic.empty()) {`
`59`		`- try {`
`60`		`- ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);`
`61`		`- } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program`
	`57`	`+ // If the dynamic lookup cache doesn't exist it will be created at the end of the program:`
	`58`	`+ llama_ngram_cache_load(ngram_cache_dynamic, params.lookup_cache_dynamic);`
`62`	`59`	`}`
`63`	`60`
`64`	`61`	`t_draft_flat_us += ggml_time_us() - t_start_draft_us;`
Original file line number	Diff line number	Diff line change
`@@ -57,18 +57,15 @@ int main(int argc, char ** argv){`
`57`	`57`	`llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);`
`58`	`58`
`59`	`59`	`if (!params.lookup_cache_static.empty()) {`
`60`		`- try {`
`61`		`- ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);`
`62`		`- } catch (std::ifstream::failure const &) {`
	`60`	`+ if(!llama_ngram_cache_load(ngram_cache_static, params.lookup_cache_static)) {`
`63`	`61`	`fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());`
`64`	`62`	`exit(1);`
`65`	`63`	`}`
`66`	`64`	`}`
`67`	`65`
`68`	`66`	`if (!params.lookup_cache_dynamic.empty()) {`
`69`		`- try {`
`70`		`- ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);`
`71`		`- } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program`
	`67`	`+ // If the dynamic lookup cache doesn't exist it will be created at the end of the program:`
	`68`	`+ llama_ngram_cache_load(ngram_cache_dynamic, params.lookup_cache_dynamic);`
`72`	`69`	`}`
`73`	`70`
`74`	`71`	`t_draft_flat_us += ggml_time_us() - t_start_draft_us;`