77
77
78
78
using json = nlohmann::ordered_json;
79
79
80
//
// Environment variable utils
//

// Read the environment variable `name` into `target`, leaving `target`
// unchanged when the variable is unset. One overload per supported target
// type, selected via SFINAE (file style predates C++20 concepts).
// `name` is taken by const reference to avoid a std::string copy per call.

template <typename T>
static typename std::enable_if<std::is_same<T, std::string>::value, void>::type
get_env(const std::string & name, T & target) {
    char * value = std::getenv(name.c_str());
    target = value ? std::string(value) : target;
}

template <typename T>
static typename std::enable_if<!std::is_same<T, bool>::value && std::is_integral<T>::value, void>::type
get_env(const std::string & name, T & target) {
    char * value = std::getenv(name.c_str());
    // NOTE(review): std::stoi throws on malformed/out-of-range values — callers
    // currently rely on that surfacing bad configs loudly; confirm before changing.
    target = value ? std::stoi(value) : target;
}

template <typename T>
static typename std::enable_if<std::is_floating_point<T>::value, void>::type
get_env(const std::string & name, T & target) {
    char * value = std::getenv(name.c_str());
    // std::stof throws on malformed input, same contract as the integral overload
    target = value ? std::stof(value) : target;
}

template <typename T>
static typename std::enable_if<std::is_same<T, bool>::value, void>::type
get_env(const std::string & name, T & target) {
    char * value = std::getenv(name.c_str());
    if (value) {
        std::string val(value);
        // only "1" and "true" (case-sensitive) enable; any other set value disables
        target = val == "1" || val == "true";
    }
}
80
115
//
81
116
// CPU utils
82
117
//
@@ -110,8 +145,34 @@ int32_t cpu_get_num_physical_cores() {
110
145
if (result == 0 ) {
111
146
return num_physical_cores;
112
147
}
113
- #elif defined(_WIN32)
114
- // TODO: Implement
148
+ #elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
149
+ // TODO: windows + arm64 + mingw64
150
+ unsigned int n_threads_win = std::thread::hardware_concurrency ();
151
+ unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2 ) : 4 ;
152
+
153
+ DWORD buffer_size = 0 ;
154
+ if (!GetLogicalProcessorInformationEx (RelationProcessorCore, nullptr , &buffer_size)) {
155
+ if (GetLastError () != ERROR_INSUFFICIENT_BUFFER) {
156
+ return default_threads;
157
+ }
158
+ }
159
+
160
+ std::vector<char > buffer (buffer_size);
161
+ if (!GetLogicalProcessorInformationEx (RelationProcessorCore, reinterpret_cast <PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data ()), &buffer_size)) {
162
+ return default_threads;
163
+ }
164
+
165
+ int32_t num_physical_cores = 0 ;
166
+ PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast <PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data ());
167
+ while (buffer_size > 0 ) {
168
+ if (info->Relationship == RelationProcessorCore) {
169
+ num_physical_cores += info->Processor .GroupCount ;
170
+ }
171
+ buffer_size -= info->Size ;
172
+ info = reinterpret_cast <PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast <char *>(info) + info->Size );
173
+ }
174
+
175
+ return num_physical_cores > 0 ? num_physical_cores : default_threads;
115
176
#endif
116
177
unsigned int n_threads = std::thread::hardware_concurrency ();
117
178
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2 ) : 4 ;
@@ -194,12 +255,6 @@ int32_t cpu_get_num_math() {
194
255
// CLI argument parsing
195
256
//
196
257
197
- void gpt_params_handle_hf_token (gpt_params & params) {
198
- if (params.hf_token .empty () && std::getenv (" HF_TOKEN" )) {
199
- params.hf_token = std::getenv (" HF_TOKEN" );
200
- }
201
- }
202
-
203
258
void gpt_params_handle_model_default (gpt_params & params) {
204
259
if (!params.hf_repo .empty ()) {
205
260
// short-hand to avoid specifying --hf-file -> default it to --model
@@ -247,7 +302,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
247
302
248
303
gpt_params_handle_model_default (params);
249
304
250
- gpt_params_handle_hf_token (params);
305
+ if (params.hf_token .empty ()) {
306
+ get_env (" HF_TOKEN" , params.hf_token );
307
+ }
251
308
252
309
if (params.escape ) {
253
310
string_process_escapes (params.prompt );
@@ -267,6 +324,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
267
324
return true ;
268
325
}
269
326
327
+ void gpt_params_parse_from_env (gpt_params & params) {
328
+ // we only care about server-related params for now
329
+ get_env (" LLAMA_ARG_MODEL" , params.model );
330
+ get_env (" LLAMA_ARG_THREADS" , params.n_threads );
331
+ get_env (" LLAMA_ARG_CTX_SIZE" , params.n_ctx );
332
+ get_env (" LLAMA_ARG_N_PARALLEL" , params.n_parallel );
333
+ get_env (" LLAMA_ARG_BATCH" , params.n_batch );
334
+ get_env (" LLAMA_ARG_UBATCH" , params.n_ubatch );
335
+ get_env (" LLAMA_ARG_N_GPU_LAYERS" , params.n_gpu_layers );
336
+ get_env (" LLAMA_ARG_THREADS_HTTP" , params.n_threads_http );
337
+ get_env (" LLAMA_ARG_CHAT_TEMPLATE" , params.chat_template );
338
+ get_env (" LLAMA_ARG_N_PREDICT" , params.n_predict );
339
+ get_env (" LLAMA_ARG_ENDPOINT_METRICS" , params.endpoint_metrics );
340
+ get_env (" LLAMA_ARG_ENDPOINT_SLOTS" , params.endpoint_slots );
341
+ get_env (" LLAMA_ARG_EMBEDDINGS" , params.embedding );
342
+ get_env (" LLAMA_ARG_FLASH_ATTN" , params.flash_attn );
343
+ get_env (" LLAMA_ARG_DEFRAG_THOLD" , params.defrag_thold );
344
+ }
345
+
270
346
bool gpt_params_parse (int argc, char ** argv, gpt_params & params) {
271
347
const auto params_org = params; // the example can modify the default params
272
348
@@ -1727,7 +1803,13 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
1727
1803
if (params.n_threads_batch != -1 ) {
1728
1804
os << " (n_threads_batch = " << params.n_threads_batch << " )" ;
1729
1805
}
1806
+ #if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
1807
+ // TODO: windows + arm64 + mingw64
1808
+ DWORD logicalProcessorCount = GetActiveProcessorCount (ALL_PROCESSOR_GROUPS);
1809
+ os << " / " << logicalProcessorCount << " | " << llama_print_system_info ();
1810
+ #else
1730
1811
os << " / " << std::thread::hardware_concurrency () << " | " << llama_print_system_info ();
1812
+ #endif
1731
1813
1732
1814
return os.str ();
1733
1815
}
@@ -2702,12 +2784,6 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
2702
2784
return text;
2703
2785
}
2704
2786
2705
- bool llama_should_add_bos_token (const llama_model * model) {
2706
- const int add_bos = llama_add_bos_token (model);
2707
-
2708
- return add_bos != -1 ? bool (add_bos) : (llama_vocab_type (model) == LLAMA_VOCAB_TYPE_SPM);
2709
- }
2710
-
2711
2787
//
2712
2788
// Chat template utils
2713
2789
//
0 commit comments