Commit ff24af6

phymbert authored and ggerganov committed
common: llama_load_model_from_url split support (ggml-org#6192)
* llama: llama_split_prefix: fix strncpy not including the string terminator
* common: llama_load_model_from_url:
  - fix header name case sensitivity
  - support downloading additional splits in parallel
  - hide password in url
* common: EOL at EOF
* common: remove redundant LLAMA_CURL_MAX_PATH_LENGTH definition
* common: change max url length
* common: minor comment
* server: support HF URL options
* llama: llama_model_loader: fix log
* common: use a constant for max url length
* common: clean up curl if file cannot be loaded in gguf
* server: tests: add split tests, and HF options params
* common: move llama_download_hide_password_in_url inside llama_download_file as a lambda
* server: tests: enable back Release test on PR
* spacing fixes (x3, suggested by Georgi Gerganov <[email protected]>)

Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 44816bc commit ff24af6
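
A note on the first bullet: `strncpy` writes no terminating NUL when the source is at least as long as the byte count, so a prefix copied with it must be terminated by hand. A standalone illustration of the pitfall and the fix (not code from this commit):

```cpp
#include <cstdio>
#include <cstring>

int main() {
    const char src[] = "models/ggml-model-00001-of-00003.gguf";
    char prefix[7];

    // strncpy copies at most 6 bytes here and appends NO terminator,
    // because strlen(src) >= 6 -- reading `prefix` as a C string
    // would run off the end of the buffer.
    strncpy(prefix, src, 6);

    prefix[6] = '\0'; // the fix: terminate explicitly

    std::printf("%s\n", prefix); // prints "models"
    return 0;
}
```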

File tree

10 files changed: +199 −62 lines

.github/workflows/server.yml

Lines changed: 0 additions & 1 deletion

```diff
@@ -35,7 +35,6 @@ jobs:
       include:
         - build_type: Release
           sanitizer: ""
-          disabled_on_pr: true
       fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken

     container:
```

common/common.cpp

Lines changed: 157 additions & 46 deletions

```diff
@@ -39,6 +39,9 @@
 #endif
 #if defined(LLAMA_USE_CURL)
 #include <curl/curl.h>
+#include <curl/easy.h>
+#include <thread>
+#include <future>
 #endif

 #if defined(_MSC_VER)
@@ -61,7 +64,7 @@
 #else
 #include <sys/syslimits.h>
 #endif
-#define LLAMA_CURL_MAX_PATH_LENGTH PATH_MAX
+#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
 #define LLAMA_CURL_MAX_HEADER_LENGTH 256
 #endif // LLAMA_USE_CURL

@@ -1702,27 +1705,13 @@ void llama_batch_add(

 #ifdef LLAMA_USE_CURL

-struct llama_model * llama_load_model_from_url(
-        const char * model_url,
-        const char * path_model,
-        const struct llama_model_params & params) {
-    // Basic validation of the model_url
-    if (!model_url || strlen(model_url) == 0) {
-        fprintf(stderr, "%s: invalid model_url\n", __func__);
-        return NULL;
-    }
-
-    // Initialize libcurl globally
-    auto curl = curl_easy_init();
-
-    if (!curl) {
-        fprintf(stderr, "%s: error initializing libcurl\n", __func__);
-        return NULL;
-    }
+static bool llama_download_file(CURL * curl, const char * url, const char * path) {
+    bool force_download = false;

     // Set the URL, allow to follow http redirection
-    curl_easy_setopt(curl, CURLOPT_URL, model_url);
+    curl_easy_setopt(curl, CURLOPT_URL, url);
     curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+
 #if defined(_WIN32)
     // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
     //   operating system. Currently implemented under MS-Windows.
@@ -1731,24 +1720,24 @@ struct llama_model * llama_load_model_from_url(

     // Check if the file already exists locally
     struct stat model_file_info;
-    auto file_exists = (stat(path_model, &model_file_info) == 0);
+    auto file_exists = (stat(path, &model_file_info) == 0);

     // If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files
     char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
-    char etag_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
-    snprintf(etag_path, sizeof(etag_path), "%s.etag", path_model);
+    char etag_path[PATH_MAX] = {0};
+    snprintf(etag_path, sizeof(etag_path), "%s.etag", path);

     char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
-    char last_modified_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
-    snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path_model);
+    char last_modified_path[PATH_MAX] = {0};
+    snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path);

     if (file_exists) {
         auto * f_etag = fopen(etag_path, "r");
         if (f_etag) {
             if (!fgets(etag, sizeof(etag), f_etag)) {
                 fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path);
             } else {
-                fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, etag_path, etag);
+                fprintf(stderr, "%s: previous file found %s: %s\n", __func__, etag_path, etag);
             }
             fclose(f_etag);
         }
@@ -1758,7 +1747,7 @@ struct llama_model * llama_load_model_from_url(
             if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) {
                 fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path);
             } else {
-                fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, last_modified_path,
+                fprintf(stderr, "%s: previous file found %s: %s\n", __func__, last_modified_path,
                         last_modified);
             }
             fclose(f_last_modified);
@@ -1776,6 +1765,11 @@ struct llama_model * llama_load_model_from_url(
         auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
             llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;

+            // Convert header field name to lowercase
+            for (size_t i = 0; i < n_items && buffer[i] != ':'; ++i) {
+                buffer[i] = tolower(buffer[i]);
+            }
+
             const char * etag_prefix = "etag: ";
             if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) {
                 strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF
@@ -1798,38 +1792,42 @@ struct llama_model * llama_load_model_from_url(
         if (res != CURLE_OK) {
             curl_easy_cleanup(curl);
             fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
-            return NULL;
+            return false;
         }

         long http_code = 0;
         curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code);
         if (http_code != 200) {
             // HEAD not supported, we don't know if the file has changed
             // force trigger downloading
-            file_exists = false;
+            force_download = true;
             fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
         }
     }

     // If the ETag or the Last-Modified headers are different: trigger a new download
-    if (!file_exists || strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) {
-        char path_model_temporary[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
-        snprintf(path_model_temporary, sizeof(path_model_temporary), "%s.downloadInProgress", path_model);
+    bool should_download = !file_exists
+        || force_download
+        || (strlen(headers.etag) > 0 && strcmp(etag, headers.etag) != 0)
+        || (strlen(headers.last_modified) > 0 && strcmp(last_modified, headers.last_modified) != 0);
+    if (should_download) {
+        char path_temporary[PATH_MAX] = {0};
+        snprintf(path_temporary, sizeof(path_temporary), "%s.downloadInProgress", path);
         if (file_exists) {
-            fprintf(stderr, "%s: deleting previous downloaded model file: %s\n", __func__, path_model);
-            if (remove(path_model) != 0) {
+            fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path);
+            if (remove(path) != 0) {
                 curl_easy_cleanup(curl);
-                fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path_model);
-                return NULL;
+                fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path);
+                return false;
             }
         }

         // Set the output file
-        auto * outfile = fopen(path_model_temporary, "wb");
+        auto * outfile = fopen(path_temporary, "wb");
         if (!outfile) {
             curl_easy_cleanup(curl);
-            fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model);
-            return NULL;
+            fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path);
+            return false;
         }

         typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
@@ -1843,15 +1841,30 @@ struct llama_model * llama_load_model_from_url(
         // display download progress
         curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);

+        // helper function to hide password in URL
+        auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
+            std::size_t protocol_pos = url.find("://");
+            if (protocol_pos == std::string::npos) {
+                return url; // Malformed URL
+            }
+
+            std::size_t at_pos = url.find('@', protocol_pos + 3);
+            if (at_pos == std::string::npos) {
+                return url; // No password in URL
+            }
+
+            return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
+        };
+
         // start the download
-        fprintf(stderr, "%s: downloading model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
-                model_url, path_model, headers.etag, headers.last_modified);
+        fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
+                llama_download_hide_password_in_url(url).c_str(), path, headers.etag, headers.last_modified);
         auto res = curl_easy_perform(curl);
         if (res != CURLE_OK) {
             fclose(outfile);
             curl_easy_cleanup(curl);
             fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
-            return NULL;
+            return false;
         }

         long http_code = 0;
@@ -1860,7 +1873,7 @@ struct llama_model * llama_load_model_from_url(
             fclose(outfile);
             curl_easy_cleanup(curl);
             fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
-            return NULL;
+            return false;
         }

         // Clean up
@@ -1872,7 +1885,7 @@ struct llama_model * llama_load_model_from_url(
             if (etag_file) {
                 fputs(headers.etag, etag_file);
                 fclose(etag_file);
-                fprintf(stderr, "%s: model etag saved %s: %s\n", __func__, etag_path, headers.etag);
+                fprintf(stderr, "%s: file etag saved %s: %s\n", __func__, etag_path, headers.etag);
             }
         }

@@ -1882,20 +1895,118 @@ struct llama_model * llama_load_model_from_url(
             if (last_modified_file) {
                 fputs(headers.last_modified, last_modified_file);
                 fclose(last_modified_file);
-                fprintf(stderr, "%s: model last modified saved %s: %s\n", __func__, last_modified_path,
+                fprintf(stderr, "%s: file last modified saved %s: %s\n", __func__, last_modified_path,
                         headers.last_modified);
             }
         }

-        if (rename(path_model_temporary, path_model) != 0) {
+        if (rename(path_temporary, path) != 0) {
+            curl_easy_cleanup(curl);
+            fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary, path);
+            return false;
+        }
+    }
+
+    return true;
+}
+
+struct llama_model * llama_load_model_from_url(
+        const char * model_url,
+        const char * path_model,
+        const struct llama_model_params & params) {
+    // Basic validation of the model_url
+    if (!model_url || strlen(model_url) == 0) {
+        fprintf(stderr, "%s: invalid model_url\n", __func__);
+        return NULL;
+    }
+
+    // Initialize libcurl
+    auto * curl = curl_easy_init();
+
+    if (!curl) {
+        fprintf(stderr, "%s: error initializing libcurl\n", __func__);
+        return NULL;
+    }
+
+    if (!curl) {
+        fprintf(stderr, "%s: error initializing libcurl\n", __func__);
+        return NULL;
+    }
+
+    if (!llama_download_file(curl, model_url, path_model)) {
+        return NULL;
+    }
+
+    // check for additional GGUFs split to download
+    int n_split = 0;
+    {
+        struct gguf_init_params gguf_params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ NULL,
+        };
+        auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
+        if (!ctx_gguf) {
+            fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model);
             curl_easy_cleanup(curl);
-            fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_model_temporary, path_model);
             return NULL;
         }
+
+        auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
+        if (key_n_split >= 0) {
+            n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
+        }
+
+        gguf_free(ctx_gguf);
     }

     curl_easy_cleanup(curl);

+    if (n_split > 1) {
+        char split_prefix[PATH_MAX] = {0};
+        char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+
+        // Verify the first split file format
+        // and extract split URL and PATH prefixes
+        {
+            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
+                fprintf(stderr, "\n%s: unexpected model file name: %s"
+                                " n_split=%d\n", __func__, path_model, n_split);
+                return NULL;
+            }
+
+            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
+                fprintf(stderr, "\n%s: unexpected model url: %s"
+                                " n_split=%d\n", __func__, model_url, n_split);
+                return NULL;
+            }
+        }
+
+        // Prepare download in parallel
+        std::vector<std::future<bool>> futures_download;
+        for (int idx = 1; idx < n_split; idx++) {
+            futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool {
+                char split_path[PATH_MAX] = {0};
+                llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
+
+                char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+                llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
+
+                auto * curl = curl_easy_init();
+                bool res = llama_download_file(curl, split_url, split_path);
+                curl_easy_cleanup(curl);
+
+                return res;
+            }, idx));
+        }
+
+        // Wait for all downloads to complete
+        for (auto & f : futures_download) {
+            if (!f.get()) {
+                return NULL;
+            }
+        }
+    }
+
     return llama_load_model_from_file(path_model, params);
 }

```
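
For orientation, a minimal sketch of how the refactored entry point might be called (illustrative URL and local path; assumes a build with LLAMA_USE_CURL and the llama.cpp API of this era — this is not part of the commit):

```cpp
#include <cstdio>

#include "common.h"
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params params = llama_model_default_params();

    // Downloads the first file, reads split.count from its GGUF header,
    // fetches any remaining splits in parallel, then loads the model.
    llama_model * model = llama_load_model_from_url(
        "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf",
        "ggml-model-q4_0.gguf",
        params);

    if (model == NULL) {
        fprintf(stderr, "download or load failed\n");
        return 1;
    }

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```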

common/common.h

Lines changed: 7 additions & 0 deletions

```diff
@@ -306,3 +306,10 @@ struct llama_control_vector_load_info {
 // Load control vectors, scale each by strength, and add them together.
 // On error, returns {-1, empty}
 llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
+
+//
+// Split utils
+//
+static const char * const LLM_KV_SPLIT_NO            = "split.no";
+static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
+static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
```
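
These keys are plain GGUF metadata, so any tool can check whether a file is one split of a larger model. A minimal sketch using the same gguf calls the commit itself uses (hypothetical helper, not part of the change):

```cpp
#include "ggml.h"   // gguf_* API
#include "common.h" // LLM_KV_SPLIT_COUNT

// Return the split count recorded in a GGUF header,
// 0 when the key is absent (single-file model), -1 on error.
static int gguf_split_count(const char * path) {
    struct gguf_init_params params = {
        /*.no_alloc = */ true, // metadata only, skip tensor data
        /*.ctx      = */ NULL,
    };
    struct gguf_context * ctx = gguf_init_from_file(path, params);
    if (!ctx) {
        return -1;
    }

    int n_split = 0;
    const int key = gguf_find_key(ctx, LLM_KV_SPLIT_COUNT);
    if (key >= 0) {
        n_split = gguf_get_val_u16(ctx, key);
    }

    gguf_free(ctx);
    return n_split;
}
```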

examples/gguf-split/gguf-split.cpp

Lines changed: 0 additions & 4 deletions

```diff
@@ -26,10 +26,6 @@ enum split_operation : uint8_t {
     SPLIT_OP_MERGE,
 };

-static const char * const LLM_KV_SPLIT_NO = "split.no";
-static const char * const LLM_KV_SPLIT_COUNT = "split.count";
-static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
-
 struct split_params {
     split_operation operation = SPLIT_OP_SPLIT;
     int n_split_tensors = 128;
```

examples/server/README.md

Lines changed: 3 additions & 1 deletion

```diff
@@ -20,7 +20,9 @@ The project is under active development, and we are [looking for feedback and co
 - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU.
 - `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`)
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
-- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
+- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (default: unused).
+- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository (default: unused).
+- `-hff FILE, --hf-file FILE`: Hugging Face model file (default: unused).
 - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
 - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
```
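
As an illustrative invocation, the two Hugging Face options are presumably equivalent to spelling out the full `--model-url`: `--hf-repo ggml-org/models --hf-file phi-2/ggml-model-q4_0.gguf` should resolve to the download URL https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf previously shown for `--model-url`.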
