6
6
#include < map>
7
7
#include < string>
8
8
9
- static const std::map<std::string, enum llama_ftype> LLAMA_FTYPE_MAP = {
10
- {" q4_0" , LLAMA_FTYPE_MOSTLY_Q4_0},
11
- {" q4_1" , LLAMA_FTYPE_MOSTLY_Q4_1},
12
- {" q4_2" , LLAMA_FTYPE_MOSTLY_Q4_2},
13
- {" q5_0" , LLAMA_FTYPE_MOSTLY_Q5_0},
14
- {" q5_1" , LLAMA_FTYPE_MOSTLY_Q5_1},
15
- {" q8_0" , LLAMA_FTYPE_MOSTLY_Q8_0},
9
// Lookup table from the type names accepted on the command line to their
// llama_ftype values; used by try_parse_ftype and by the usage message in main.
+ static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
10
+ {" q4_0" , LLAMA_FTYPE_MOSTLY_Q4_0},
11
+ {" q4_1" , LLAMA_FTYPE_MOSTLY_Q4_1},
12
+ {" q4_2" , LLAMA_FTYPE_MOSTLY_Q4_2},
13
+ {" q5_0" , LLAMA_FTYPE_MOSTLY_Q5_0},
14
+ {" q5_1" , LLAMA_FTYPE_MOSTLY_Q5_1},
15
+ {" q8_0" , LLAMA_FTYPE_MOSTLY_Q8_0},
16
16
};
17
17
18
+ bool try_parse_ftype (const std::string & ftype_str, llama_ftype & ftype, std::string & ftype_str_out) {
19
+ auto it = LLAMA_FTYPE_MAP.find (ftype_str);
20
+ if (it != LLAMA_FTYPE_MAP.end ()) {
21
+ ftype = it->second ;
22
+ ftype_str_out = it->first ;
23
+ return true ;
24
+ }
25
+ // try to parse as an integer
26
+ try {
27
+ int ftype_int = std::stoi (ftype_str);
28
+ for (auto it = LLAMA_FTYPE_MAP.begin (); it != LLAMA_FTYPE_MAP.end (); it++) {
29
+ if (it->second == ftype_int) {
30
+ ftype = it->second ;
31
+ ftype_str_out = it->first ;
32
+ return true ;
33
+ }
34
+ }
35
+ }
36
+ catch (...) {
37
+ // stoi failed
38
+ }
39
+ return false ;
40
+ }
41
+
18
42
// usage:
19
- // ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
43
+ // ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
20
44
//
21
45
int main (int argc, char ** argv) {
22
46
ggml_time_init ();
23
47
24
- if (argc < 4 ) {
25
- fprintf (stderr, " usage: %s model-f32.bin model-quant.bin type [nthread ]\n " , argv[0 ]);
48
+ if (argc < 3 ) {
49
+ fprintf (stderr, " usage: %s model-f32.bin [ model-quant.bin] type [nthreads ]\n " , argv[0 ]);
26
50
for (auto it = LLAMA_FTYPE_MAP.begin (); it != LLAMA_FTYPE_MAP.end (); it++) {
27
51
fprintf (stderr, " type = \" %s\" or %d\n " , it->first .c_str (), it->second );
28
52
}
@@ -36,24 +60,62 @@ int main(int argc, char ** argv) {
36
60
ggml_free (ctx);
37
61
}
38
62
63
+ // parse command line arguments
39
64
const std::string fname_inp = argv[1 ];
40
- const std::string fname_out = argv[2 ];
65
+ std::string fname_out;
66
+ int nthread;
67
+ llama_ftype ftype;
68
+
69
+ int arg_idx = 2 ;
70
+ std::string ftype_str;
71
+ if (try_parse_ftype (argv[arg_idx], ftype, ftype_str)) {
72
+ // argv[2] is the ftype
73
+ std::string fpath;
74
+ const size_t pos = fname_inp.find_last_of (' /' );
75
+ if (pos != std::string::npos) {
76
+ fpath = fname_inp.substr (0 , pos + 1 );
77
+ }
78
+ // export as [inp path]/ggml-model-[ftype].bin
79
+ fname_out = fpath + " ggml-model-" + ftype_str + " .bin" ;
80
+ arg_idx++;
81
+ }
82
+ else {
83
+ // argv[2] is the output path
84
+ fname_out = argv[arg_idx];
85
+ arg_idx++;
41
86
42
- enum llama_ftype ftype;
43
- if (argv[3 ][0 ] == ' q' ) {
44
- auto it = LLAMA_FTYPE_MAP.find (argv[3 ]);
45
- if (it == LLAMA_FTYPE_MAP.end ()) {
46
- fprintf (stderr, " %s: unknown ftype '%s'\n " , __func__, argv[3 ]);
87
+ if (argc <= arg_idx) {
88
+ fprintf (stderr, " %s: missing ftype\n " , __func__);
89
+ return 1 ;
90
+ }
91
+ // argv[3] is the ftype
92
+ if (!try_parse_ftype (argv[arg_idx], ftype, ftype_str)) {
93
+ fprintf (stderr, " %s: invalid ftype '%s'\n " , __func__, argv[3 ]);
94
+ return 1 ;
95
+ }
96
+ arg_idx++;
97
+ }
98
+
99
+ // parse nthreads
100
+ if (argc > arg_idx) {
101
+ try {
102
+ nthread = std::stoi (argv[arg_idx]);
103
+ }
104
+ catch (const std::exception & e) {
105
+ fprintf (stderr, " %s: invalid nthread '%s' (%s)\n " , __func__, argv[arg_idx], e.what ());
47
106
return 1 ;
48
107
}
49
- ftype = it->second ;
50
108
} else {
51
- ftype = ( enum llama_ftype) atoi (argv[ 3 ]) ;
109
+ nthread = 0 ;
52
110
}
53
111
54
112
fprintf (stderr, " %s: build = %d (%s)\n " , __func__, BUILD_NUMBER, BUILD_COMMIT);
55
113
56
- int nthread = argc > 4 ? atoi (argv[4 ]) : 0 ;
114
+ fprintf (stderr, " %s: quantizing '%s' to '%s' as %s" , __func__, fname_inp.c_str (), fname_out.c_str (), ftype_str.c_str ());
115
+ if (nthread > 0 ) {
116
+ fprintf (stderr, " using %d threads" , nthread);
117
+ }
118
+ fprintf (stderr, " \n " );
57
119
58
120
const int64_t t_main_start_us = ggml_time_us ();
59
121
0 commit comments