@@ -38,6 +38,20 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "-h" || arg == "--help") {
             gpt_print_usage(argc, argv, params);
             exit(0);
+        } else if (arg == "-f" || arg == "--file") {
+            if (++i > argc) {
+                fprintf(stderr, "Invalid file param");
+                break;
+            }
+            std::ifstream file(argv[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                break;
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
+            if (params.prompt.back() == '\n') {
+                params.prompt.pop_back();
+            }
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             gpt_print_usage(argc, argv, params);
@@ -57,6 +71,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
     fprintf(stderr, "                        prompt to start generation with (default: random)\n");
+    fprintf(stderr, "  -f FNAME, --file FNAME\n");
+    fprintf(stderr, "                        load prompt from a file\n");
     fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d)\n", params.n_predict);
     fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
     fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", params.top_p);
@@ -192,6 +208,10 @@ std::map<std::string, int32_t> json_parse(const std::string & fname) {
     return result;
 }
 
+void gpt_vocab::add_special_token(const std::string & token) {
+    special_tokens.push_back(token);
+}
+
 std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
     std::vector<std::string> words;
 
@@ -200,6 +220,20 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
     std::string str = text;
     std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
 
+    // Generate the subpattern from the special_tokens vector if it's not empty
+    if (!vocab.special_tokens.empty()) {
+        std::string special_tokens_subpattern;
+        for (const auto & token : vocab.special_tokens) {
+            if (!special_tokens_subpattern.empty()) {
+                special_tokens_subpattern += "|";
+            }
+            special_tokens_subpattern += token;
+        }
+
+        // Modify the regex pattern with the generated special tokens subpattern
+        pat = special_tokens_subpattern + "|" + pat;
+    }
+
     std::regex re(pat);
     std::smatch m;
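
Taken together, the two tokenizer hunks let a caller register strings that the pre-tokenization regex should treat as indivisible: gpt_vocab::add_special_token() stores them on the vocab, and gpt_tokenize() prepends them to its pattern so each one matches as a single unit. Below is a minimal usage sketch, not part of the commit; the "utils.h" include and the "<EOT>" marker are illustrative assumptions, and because the token text is spliced into the regex verbatim, a token containing regex metacharacters would need escaping by the caller.

    #include <string>
    #include <vector>

    #include "utils.h" // assumed name of the header declaring gpt_vocab and gpt_tokenize

    int main() {
        gpt_vocab vocab;
        // ... in the real examples the vocab is filled while loading the model ...

        // Register a hypothetical marker before tokenizing; "<EOT>" is chosen here
        // only because it contains no regex metacharacters.
        vocab.add_special_token("<EOT>");

        // The marker now survives pre-tokenization as one piece ("<EOT>") instead of
        // being split into "<", "EOT", ">" by the default BPE pattern.
        std::vector<gpt_vocab::id> tokens = gpt_tokenize(vocab, "hello <EOT> world");
        (void) tokens;
        return 0;
    }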